if (!is.na(fig_dir)) system(sprintf("mkdir -p %s", fig_dir))

Barcode collision

plot_collision <- function(df, threshold=0.95) {
  xmax <- max(df$HUMAN_READS * 1.1)
  ymax <- max(df$MOUSE_READS * 1.1)
  df %>%
  mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
                        MOUSE_PERCENT >= threshold ~ "Mouse",
                        TRUE ~ "Collision")) %>%
    mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse"))) %>%
    ggplot(aes(HUMAN_READS, MOUSE_READS, color=CELL)) +
    geom_point(alpha=0.3) +
    xlab("# of human unique insertions") +
    ylab("# of mouse unique insertions") +
    scale_color_manual(values=c("Human"="orangered", "Mouse"="dodgerblue", "Collision"="grey80")) +
    scale_x_continuous(labels=scales::format_format(scientific = FALSE),
                       breaks=scales::pretty_breaks(n=4)) +
    scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                       breaks=scales::pretty_breaks(n=4)) +
    coord_cartesian(xlim=c(0, xmax), ylim=c(0, ymax)) +
    theme_bw() +
    theme(legend.title=element_blank(),
          legend.justification=c(1, 1),
          legend.position=c(0.95, 0.95),
          legend.background=element_rect(colour="black", size=0.5),
          axis.line = element_line(colour = "black"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank())
}

Fig 1C

df <- read_delim(slfile("lib1-6forCollision.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")

# df %>%
#   mutate(CELL=case_when(HUMAN_PERCENT >= 0.9 ~ "Human",
#                         MOUSE_PERCENT >= 0.9 ~ "Mouse",
#                         TRUE ~ "Collision")) %>%
#   mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse"))) %>%
#   count(CELL)

p <- plot_collision(df)
p

if (!is.na(fig_dir)) ggsave("fig1C.pdf", p, "pdf", fig_dir, height=2.5, width=3)

# Summary stats used in text
threshold <- 0.95
df %>%
  mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
                        MOUSE_PERCENT >= threshold ~ "Mouse",
                        TRUE ~ "Collision")) %>%
  count(CELL)
## # A tibble: 3 x 2
##   CELL          n
##   <chr>     <int>
## 1 Collision    48
## 2 Human       719
## 3 Mouse       315
df %>%
  mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
                        MOUSE_PERCENT >= threshold ~ "Mouse",
                        TRUE ~ "Collision")) %>%
  group_by(CELL) %>%
  summarise(HUMAN_PERCENT=median(HUMAN_PERCENT), MOUSE_PERCENT=median(MOUSE_PERCENT))
## # A tibble: 3 x 3
##   CELL      HUMAN_PERCENT MOUSE_PERCENT
##   <chr>             <dbl>         <dbl>
## 1 Collision        0.654        0.346  
## 2 Human            0.998        0.00243
## 3 Mouse            0.0129       0.987

Number of unique reads

human_mouse_boxplot <- function(human_df, mouse_df) {
  bind_rows(human_df %>% mutate(CELL="Human"),
            mouse_df %>% mutate(CELL="Mouse")) %>%
    filter(TN5_TYPE == "Concentrated") %>%
    ggplot(aes(CELL, N_UNIQ_READS, fill=CELL)) +
    geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
    scale_fill_manual(values=c("Human"="orangered", "Mouse"="dodgerblue")) +
    xlab("") +
    ylab("# of unique insertions") +
    theme_bw() +
    scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                       breaks=scales::pretty_breaks(n=4)) +
    theme(legend.position='none',
          axis.line = element_line(colour = "black"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank())
}

unique_reads_boxplot <- function(df) {
  df %>%
    ggplot(aes(TN5_TYPE, N_UNIQ_READS, fill=TN5_TYPE)) +
    geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
    scale_fill_manual(values=c("Concentrated"="orangered", "Diluted"="dodgerblue")) +
    xlab("Transposon concentration") +
    ylab("# of human unique insertions") +
    theme_bw() +
    scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                       breaks=scales::pretty_breaks(n=4)) +
    theme(legend.position='none',
          axis.line = element_line(colour = "black"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank())
}

Fig 1D

yi129_h_m_df <- read_delim(slfile("yi129_single_cell_summary_all.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")

yi129_human_df <- read_delim(slfile("yi129_single_cell_summary_bed.txt"), col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
  mutate(N_UNIQ_READS=y * 2) %>%
  filter(CELL_ID %in% yi129_h_m_df$CELL_ID[yi129_h_m_df$HUMAN_PERCENT >= 0.9]) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
                            TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))

yi129_mouse_df <- read_delim(slfile("yi129_single_cell_summary_mouse_bed.txt"), col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
  mutate(N_UNIQ_READS=y * 2) %>% # because paired reads and this is only R1
  filter(CELL_ID %in% yi129_h_m_df$CELL_ID[yi129_h_m_df$MOUSE_PERCENT >= 0.9]) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
                            TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))

p <- human_mouse_boxplot(yi129_human_df, yi129_mouse_df)
p

if (!is.na(fig_dir)) ggsave("fig1D.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)

Fig S1A

yi128_h_m_df <- read_delim(slfile("yi128_single_cell_summary_all.txt"),
  col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"),
  delim=" ")

yi128_human_df <- read_delim(slfile("yi128_single_cell_summary_bed.txt"),
  col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
  mutate(N_UNIQ_READS=y * 2) %>%
  filter(CELL_ID %in% yi128_h_m_df$CELL_ID[yi128_h_m_df$HUMAN_PERCENT >= 0.9]) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
                            TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))

yi128_mouse_df <- read_delim(slfile("yi128_single_cell_summary_mouse_bed.txt"),
  col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
  mutate(N_UNIQ_READS=y * 2) %>%
  filter(CELL_ID %in% yi128_h_m_df$CELL_ID[yi128_h_m_df$MOUSE_PERCENT >= 0.9]) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
                            TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))

p <- human_mouse_boxplot(yi128_human_df, yi128_mouse_df)
p

if (!is.na(fig_dir)) ggsave("figS1A1.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)

p <- unique_reads_boxplot(yi128_human_df)
p

if (!is.na(fig_dir)) ggsave("figS1A2.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)

Fig S1B

p <- unique_reads_boxplot(yi129_human_df)
p

if (!is.na(fig_dir)) ggsave("figS1B.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)

Fig S1C, D

df_140 <- read_delim(slfile("yi140_GAACCG.single_cells_all.txt"), col_names=FALSE, delim=" ")
df_141 <- read_delim(slfile("yi141_AGGATG.single_cells_all.txt"), col_names=FALSE, delim=" ")
df_144 <- read_delim(slfile("yi144_single_cell_summary_all.txt"), col_names=FALSE, delim=" ")
df_145 <- read_delim(slfile("yi145_single_cell_summary_all.txt"), col_names=FALSE, delim=" ")

df <- bind_rows(
  bind_rows(df_140 %>% mutate(LIB="yi140, 141"),
            df_141 %>% mutate(LIB="yi140, 141")) %>%
    top_n(191, X4),
  bind_rows(df_144 %>% mutate(LIB="yi144, 145"),
            df_145 %>% mutate(LIB="yi144, 145")) %>%
    top_n(2200, X4)
)
  
p1 <- df %>%
  filter(LIB == "yi140, 141") %>%
  ggplot(aes(LIB, X4 * 2)) +
  geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, fill="orangered") +
  xlab("") +
  ylab("# of human unique insertions") +
  theme_bw() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=4)) +
  theme(legend.position='none',
        axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

p2 <- df %>%
  filter(LIB == "yi144, 145") %>%
  ggplot(aes(LIB, X4 * 2)) +
  geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, fill="dodgerblue") +
  xlab("") +
  ylab("# of human unique insertions") +
  theme_bw() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=4)) +
  theme(legend.position='none',
        axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

grid.arrange(p1, p2, nrow=1)

if (!is.na(fig_dir)) {
  ggsave("figS1C.pdf", p1, "pdf", fig_dir, height=2.5, width=2)
  ggsave("figS1D.pdf", p2, "pdf", fig_dir, height=2.5, width=2)
}

Fig S1E

yi174_h_df <- read_delim(slfile("yi174.human.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")

cols <- c("CELL_ID", "x", "UNIQ_READS", "DEPTH", "z")
yi174_human_df <- bind_rows(
  read_delim(slfile("yi174_ACCACT.single_cells_all.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_AGGATG.single_cells_all.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_CGCTTG.single_cells_all.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_GTCCTA.single_cells_all.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_TTCTCC.single_cells_all.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_TTTCGC.single_cells_all.txt"), col_names=cols, delim=" ")
) %>%
  mutate(N_UNIQ_READS=UNIQ_READS * 2) %>%
  filter(CELL_ID %in% yi174_h_df$CELL_ID[yi174_h_df$HUMAN_PERCENT >= 0.9]) %>%
  tidyr::extract(CELL_ID, c("TN5"), ".*\\.(........).*", remove=FALSE) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("TGATATTG", "GATCCCGT", "CTCGATTA") ~ "Concentrated",
                            TRUE ~ "Diluted"))

yi174_m_df <- bind_rows(
  read_tsv(slfile("yi174_ACCACT.mouse.txt"), col_names=c("CELL_ID")),
  read_tsv(slfile("yi174_AGGATG.mouse.txt"), col_names=c("CELL_ID")),
  read_tsv(slfile("yi174_CGCTTG.mouse.txt"), col_names=c("CELL_ID")),
  read_tsv(slfile("yi174_GTCCTA.mouse.txt"), col_names=c("CELL_ID")),
  read_tsv(slfile("yi174_TTCTCC.mouse.txt"), col_names=c("CELL_ID")),
  read_tsv(slfile("yi174_TTTCGC.mouse.txt"), col_names=c("CELL_ID"))
)

cols <- c("CELL_ID", "x", "UNIQ_READS", "z")

yi174_mouse_df <- bind_rows(
  read_delim(slfile("yi174_ACCACT.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_AGGATG.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_CGCTTG.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_GTCCTA.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_TTCTCC.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
  read_delim(slfile("yi174_TTTCGC.mouse.single_cells_bed.txt"), col_names=cols, delim=" ")
) %>%
  mutate(N_UNIQ_READS=UNIQ_READS * 2) %>%
  filter(CELL_ID %in% yi174_m_df$CELL_ID) %>%
  tidyr::extract(CELL_ID, c("TN5"), ".*\\.(........).*", remove=FALSE) %>%
  mutate(TN5_TYPE=case_when(TN5 %in% c("TGATATTG", "GATCCCGT", "CTCGATTA") ~ "Concentrated",
                            TRUE ~ "Diluted"))

p <- unique_reads_boxplot(yi174_human_df)
p

if (!is.na(fig_dir)) ggsave("figS1E1.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
p <- human_mouse_boxplot(yi174_human_df, yi174_mouse_df)
p

if (!is.na(fig_dir)) ggsave("figS1E2.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)

Fig S1F

depth_beds <- list.files(slfile("read_depth"), full.names=TRUE)
names(depth_beds) <- list.files(slfile("read_depth"))
depth_df <- map_df(depth_beds, function(x) {
  read_delim(x, col_names=FALSE, delim=" ")
}, .id="LIB")

depth_df <- depth_df %>%
  mutate(RUN=gsub("_.*", "", LIB)) %>%
  mutate(UNIQ_READS=X4 * 2) %>%
  mutate(LIB=gsub("\\.single.*|_merge.*", "", LIB)) %>%
  mutate(BC=gsub(".*_", "", LIB)) %>%
  group_by(RUN) %>%
  mutate(BC_ID=as.numeric(as.factor(BC))) %>%
  ungroup() %>%
  mutate(LIB_ID=paste(RUN, BC_ID, sep=".")) %>%
  mutate(LIB_ID=factor(LIB_ID, levels=unique(.$LIB_ID)))

p <- ggplot(depth_df, aes(LIB_ID, UNIQ_READS, fill=grepl("yi19", RUN))) +
    geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, outlier.size=0.5) +
    scale_fill_manual(values=c("orangered", "dodgerblue")) +
    xlab("") +
    ylab("# of unique insertions") +
    theme_bw() +
    scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                       breaks=scales::pretty_breaks(n=4), limits=c(0, 2500000)) +
    theme(legend.position='none',
          axis.line = element_line(colour = "black"),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1))

p

if (!is.na(fig_dir)) ggsave("figS1F.pdf", p, "pdf", fig_dir, height=4, width=7)

Fig S1G

Extrapolation through down sampling to infer relationship between number of unique reads and sequencing depth.

celltype_annot <- read.table(slfile("barcode_tn5.txt"), stringsAsFactors=FALSE, header=TRUE)

ds_df1 <- read.table(slfile("downsample_summary.tab"), stringsAsFactors=FALSE) %>%
  dplyr::rename(N_READS=V1, FILE=V2) %>%
  filter(FILE != "total" & !grepl("\\.TCCTTGTG", FILE)) %>%
  mutate(TYPE=case_when(grepl("uniq", FILE) ~ "UNIQ",
                        TRUE ~ "TOTAL"),
         FILE=gsub(".uniq", "", FILE)) %>%
  tidyr::extract(FILE, c("CELL_ID", "FRAC"), "(.*).R1.human.Qgt0.(.*).bed", remove=FALSE) %>%
  spread(TYPE, N_READS) %>%
  mutate(DOWNSAMPLE=as.numeric(FRAC) / 10) %>%
  mutate(RATIO=TOTAL / UNIQ) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  left_join(celltype_annot, by="TN5") %>%
  mutate(UNIQ = UNIQ * 2) # because these are read pairs

md1 <- lm(UNIQ ~ log10(RATIO), data=ds_df1)

pred1 <- predict(md1, ds_df1)
pred_intv1 <- predict(md1, interval="predict")
pred_df1 <- ds_df1 %>%
  mutate(FIT=pred_intv1[, "fit"],
         LOWER=pred_intv1[, "lwr"],
         UPPER=pred_intv1[, "upr"])

predict(md1, newdata=data.frame("RATIO"=5))
##       1 
## 1852515
predict(md1, newdata=data.frame("RATIO"=10))
##       1 
## 2624546
ds_df2 <- read.table(slfile("yi140_GAACCG_downsample_summary.tab"), stringsAsFactors=FALSE) %>%
  bind_rows(read.table(slfile("yi141_AGGATG_downsample_summary.tab"), stringsAsFactors=FALSE)) %>%
  dplyr::rename(N_READS=V1, FILE=V2) %>%
  mutate(FILE=gsub("downsample/", "", FILE)) %>%
  filter(FILE != "total" & !grepl("uniq[1-9]", FILE)) %>%
  mutate(TYPE=case_when(grepl("uniq", FILE) ~ "UNIQ",
                        TRUE ~ "TOTAL"),
         FILE=gsub(".uniq", "", FILE)) %>%
  tidyr::extract(FILE, c("CELL_ID", "FRAC"), "(.*).R1.human.Qgt0.(.*).bed", remove=FALSE) %>%
  spread(TYPE, N_READS) %>%
  mutate(DOWNSAMPLE=as.numeric(FRAC) / 10) %>%
  mutate(RATIO=TOTAL / UNIQ) %>%
  separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  left_join(celltype_annot, by="TN5") %>%
  mutate(UNIQ = UNIQ * 2) # because these are read pairs

md2 <- lm(UNIQ ~ log10(RATIO), data=ds_df2)

pred2 <- predict(md2, ds_df2)
pred_intv2 <- predict(md2, interval="predict")
pred_df2 <- ds_df2 %>%
  mutate(FIT=pred_intv2[, "fit"],
         LOWER=pred_intv2[, "lwr"],
         UPPER=pred_intv2[, "upr"])

predict(md2, newdata=data.frame("RATIO"=1.78))
##       1 
## 1521039
predict(md2, newdata=data.frame("RATIO"=5))
##       1 
## 4231109
predict(md2, newdata=data.frame("RATIO"=10))
##       1 
## 6049887
p <- bind_rows(pred_df1 %>% mutate(CAT="yi129"), pred_df2 %>% mutate(CAT="yi140, 141")) %>%
ggplot(aes(RATIO, UNIQ, color=CAT)) +
  geom_point(alpha=0.05) +
  geom_line(aes(RATIO, FIT, color=CAT), size=1, alpha=1) +
  scale_color_manual(values=c("orangered", "dodgerblue"), name="Library") +
  xlab("Depth") +
  ylab("# of unique insertions") +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE)) +
  # geom_ribbon(aes(x=RATIO, ymin=LOWER, ymax=UPPER), alpha=0.3, fill="orangered") +
  theme_classic() +
  theme(legend.position=c(1, 1), legend.justification=c(1, 1))
p

if (!is.na(fig_dir)) ggsave("figS1G.pdf", p, "pdf", fig_dir, height=2.5, width=3.5)

Fig S1H

df <- read_delim(slfile("lib1-6forCollision.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")

df <- df %>%
  mutate(CELL=case_when(HUMAN_PERCENT >= 0.95 ~ "Human",
                        MOUSE_PERCENT >= 0.95 ~ "Mouse",
                        TRUE ~ "Collision")) %>%
  mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse")))

fen<-df %>% 
  filter(CELL=="Collision") 

median(fen$MOUSE_PERCENT)
## [1] 0.3461845
prop_files <- list.files("collision/", ".*prop.txt", full.names=TRUE)
names(prop_files) <- gsub(".chr_prop.txt", "", basename(prop_files))

prop_df <- map_df(prop_files, function(f) {
  read_delim(f, col_names=c("CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), delim=" ")
}, .id="CELL_ID") %>%
  left_join(df, by="CELL_ID") %>%
  mutate(HUMAN_CHROM_READS_FRAC=CHROM_UNIQ_READS / HUMAN_READS)

qt1_files <- list.files("collision/", ".idx", full.names=TRUE)
names(qt1_files) <- gsub(".gt1.idx", "", basename(qt1_files))
qt1_df <- map_df(qt1_files, function(f) {
  read_table(f, col_names="CHROM_UNIQ_READS") %>%
    mutate(CHROM=c(1:22, "X", "Y")) %>%
    mutate(HUMAN_READS=sum(CHROM_UNIQ_READS)) %>%
    mutate(HUMAN_CHROM_READS_FRAC=CHROM_UNIQ_READS / HUMAN_READS)
}, .id="CELL_ID")

prop_df <- prop_df %>%
  left_join(qt1_df, by=c("CELL_ID", "CHROM"), suffix=c("", "_QT1")) %>%
  filter(!CHROM %in% c("X", "Y")) %>%
  mutate(CHROM=factor(CHROM, levels=as.character(c(1:22))))

p<-ggplot(prop_df, aes(CHROM, HUMAN_CHROM_READS_FRAC)) +
  geom_boxplot(aes(fill=CELL), outlier.shape=NA) +
  scale_fill_manual(values=c("Human"="orangered", "Mouse"="dodgerblue", "Collision"="grey80")) +
  coord_cartesian(ylim=c(0, 0.25)) +
  labs(x="Chromosome", y="Human reads fraction for chromosome") +
  theme_bw() +
  theme(legend.title=element_blank(),
        legend.justification=c(1, 1),
        legend.position=c(0.95, 0.95),
        legend.background=element_rect(colour="black", size=0.5),
        axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())
p

if (!is.na(fig_dir)) ggsave("figS1H.pdf", p, "pdf", fig_dir, height=3, width=6)

Copy numbers

plot_chr <- function(df, chr_size) {
  chr_size <- mutate(chr_size, Chromosome=gsub("chr", "", Chromosome))
  chrs <- unique(chr_size$Chromosome)
  chr_levels <- chrs[order(as.numeric(chrs))]
  df$Chromosome <- factor(df$Chromosome, levels=chr_levels)
  
  chr_size <- chr_size %>%
    mutate(Chromosome=factor(Chromosome, levels=chr_levels)) %>%
    arrange(Chromosome) %>%
  mutate(Size_M=Size/1000000) %>%
  mutate(Cumsize=cumsum(Size_M) - Size_M)

  # add chromosome size to Start
  df_new <- df %>%
    arrange(Chromosome, Start) %>%
    filter(Chromosome != "M") %>%
    mutate(Start_M=Start/1000000) %>%
    mutate(Cumstart=0)

  for (i in seq_len(nrow(df_new))) {
    this_chr <- df_new$Chromosome[i]
    this_cumsize <- filter(chr_size, Chromosome == this_chr)$Cumsize
    df_new[i, "Cumstart"] <- df_new[i, "Start_M"] + this_cumsize
  }

  g <- ggplot(df_new, aes(Cumstart, Ratio)) +
    geom_vline(xintercept=chr_size$Cumsize, color="#f0f0f0") +
    geom_point(aes(color=Chromosome), size=0.2, alpha=0.8) +
    scale_color_manual(values=rep(c("orangered", "dodgerblue"), 12)) +
    xlim(c(0, NA)) +
    ylim(c(0, 5)) +
    ylab("Relative chromosome\ncopy number") +
    xlab("") +
    theme_bw() +
    theme(legend.position='none',
          axis.line = element_line(colour = "black"),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank())
  
  return(g)
}

Fig 1E

h_freec_df <- read_tsv(slfile("yi129_GGGTGC.GTTACGCGGCAGTAG.bam_ratio.txt"), guess_max=1e7)
h_chr_size <- read_tsv(slfile("hg19.len"), col_names=c("nouse", "Chromosome", "Size"), guess_max=1e7)

m_freec_df <- read_tsv(slfile("yi129_GGGTGC.GTTACGCGCAAAACG.bam_ratio.txt"), guess_max=1e7)
m_chr_size <- read_tsv(slfile("mm10.len"), col_names=c("Chromosome", "Size"), guess_max=1e7)

p <- plot_chr(h_freec_df, h_chr_size)
p

if (!is.na(fig_dir)) ggsave("fig1E1.pdf", p, "pdf", fig_dir, height=2, width=12)
p <- plot_chr(m_freec_df, m_chr_size)
p

if (!is.na(fig_dir)) ggsave("fig1E2.pdf", p, "pdf", fig_dir, height=2, width=12)

Fig 1F

aneuploidy_df <- read.table(slfile("lib144_aneuploidy.txt"), col.names=c("CELL",    "CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), stringsAsFactors=FALSE) %>%
  bind_rows(read.table(slfile("lib145_aneuploidy.txt"), col.names=c("CELL", "CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), stringsAsFactors=FALSE)) %>%
  mutate(CHROM=factor(CHROM, levels=c(as.character(1:22), "X", "Y"))) %>%
  mutate(CELL=gsub("./|.chr_prop.txt", "", CELL))

aneu_df <- aneuploidy_df %>%
  separate(CELL, c("LIB", "BC"), sep="_", remove=FALSE) %>%
  separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
  separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
  left_join(celltype_annot, by="TN5")

p <- aneu_df %>%
  # filter(CHROM != "Y") %>%
  ggplot(aes(CHROM, RATIO)) +
  # geom_jitter(aes(color=RATIO < 0.1), alpha=0.3) +
  geom_boxplot(aes(fill=CELL_TYPE), outlier.shape=NA, outlier.stroke=NA, size=0.5) +
  scale_fill_manual(values=c("orangered", "dodgerblue")) +
  facet_wrap(~CELL_TYPE, nrow=2) +
  coord_cartesian(ylim=c(0, 2)) +
  xlab("Chromosome") +
  ylab("Normalized copy number\nrelative to euploidy") +
  theme_classic() +
  theme(strip.background=element_blank(),
        legend.position='none')

p

if (!is.na(fig_dir)) ggsave("fig1F.pdf", p, "pdf", fig_dir, width=12, height=3)

Examples of crossover types

Fig 4A, haploid example

hb_example <- readRDS(slfile("yi190_ACGCGA.ATCGCGTTGGCTTGG.filtered.rds"))
hb_example_plot <- hb_example$haploidState %>%
  mutate(POS=POS / 1e6) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  ggplot(aes(POS, FREQ_ALT)) +
  geom_point(alpha=0.05) +
  geom_line(aes(POS, ALT_STATE - (ALT_STATE - 0.5) / 5), color="red") +
  facet_wrap(~CHROM, nrow=2, scales="free_x") +
  xlab("Chromosome coordinates (M)") +
  ylab("Allele frequency\n(Spretus)") +
  theme_classic() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=2)) +
  theme(legend.position='none',
        strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
        strip.background = element_blank(),
        axis.text.x = element_text(angle=45, hjust = 1))
hb_example_plot

if (!is.na(fig_dir)) ggsave("fig4A.pdf", hb_example_plot, "pdf", fig_dir, width=10, height=2.5)

Fig 4B, M2 diploid, meiotic segragation example

me_example <- readRDS(slfile("yi193_CGCTTG.CATCAAGGCACTGTT.filtered.rds"))
me_example_plot <- me_example$m2DiploidState %>%
  mutate(POS=POS / 1e6) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  ggplot(aes(POS, FREQ_ALT)) +
  geom_point(alpha=0.15) +
  geom_line(aes(POS, ALT_STATE), color="red") +
  facet_wrap(~CHROM, nrow=2, scales="free_x") +
  xlim(c(3.7, NA)) +
  xlab("Chromosome coordinates (M)") +
  ylab("Allele frequency\n(Spretus)") +
  theme_classic() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=2)) +
  theme(legend.position='none',
        strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
        strip.background = element_blank(),
        axis.text.x = element_text(angle=45, hjust = 1))

me_example_plot

if (!is.na(fig_dir)) ggsave("fig4B.pdf", me_example_plot, "pdf", fig_dir, width=10, height=2.5)

Fig 4C, M2 diploid, mitotic segragation example

mt_example <- readRDS(slfile("yi193_TATTCT.ACCATTTATCGGTTT.filtered.rds"))

mt_example_plot <- mt_example$m2DiploidState %>%
  # manually removed three artifacts, reflect it here
  filter(!(CHROM == "chr8" & between(POS, 50e6, 100e6) & ALT_STATE == 0.5)) %>%
  filter(!(CHROM == "chr16" & between(POS, 50e6, 75e6) & ALT_STATE == 0.5)) %>%
  filter(!(CHROM == "chr17" & between(POS, 30e6, 50e6) & ALT_STATE == 0.5)) %>%
  mutate(POS=POS / 1e6) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  ggplot(aes(POS, FREQ_ALT)) +
  geom_point(alpha=0.15) +
  geom_line(aes(POS, ALT_STATE), color="red") +
  facet_wrap(~CHROM, nrow=2, scales="free_x") +
  xlab("Chromosome coordinates (M)") +
  ylab("Allele frequency\n(Spretus)") +
  theme_classic() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=2)) +
  theme(legend.position='none',
        strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
        strip.background = element_blank(),
        axis.text.x = element_text(angle=45, hjust = 1))

mt_example_plot

if (!is.na(fig_dir)) ggsave("fig4C.pdf", mt_example_plot, "pdf", fig_dir, width=10, height=2.5)

Meiotic and mitotic chromosomes distribution

m2_diploid <- read_tsv(slfile("m2.final.tab")) # used only here in fig 2
m2_upd <- read_tsv(slfile("m2.upd.tab")) # used only here in fig 2

hb_spret_df <- read_tsv(slfile("haploid.no0.hb.filtered.spret.tab"))
haploid_spret_upd<- read_tsv(slfile("haploid.upd.0312.spret.tab"))


hb_cast_df <- read_tsv(slfile("haploid.no0.hb.filtered.cast.tab"))
m2_cast_df <- read_tsv(slfile("m2diploid.concensus.mb.cast.tab"))
m2_cast_upd<-read_tsv(slfile("cast/m2.upd.cast.tab"))

Fig 4D, 4E

# count chromosomes per cell that are me or mt in m2 and hp separately
# me cells
me_per_cell_m2 <- m2_diploid %>% 
  filter(SEGREGATION %in% c("me", "hp")) %>%
  bind_rows(filter(m2_upd, STRAIN %in% c("bpNA"))) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID)
me_per_cell_m2_upd <- m2_upd %>% 
  filter(STRAIN %in% c("spret", "b6")) %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
drop0_per_cell <- m2_upd %>%
  filter(STRAIN %in% c("drop0", "spret/drop0")) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID) %>%
  dplyr::rename(n_drop0=n)
me_per_cell_m2_merge <- full_join(me_per_cell_m2, me_per_cell_m2_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
  full_join(drop0_per_cell, drop0_per_cell, by="CELL_ID") %>%
  replace_na(list(n_m2=0, n_m2_upd=0, n_drop0=0)) %>%
  mutate(SUM_m2=n_m2+n_m2_upd+n_drop0)

# mt cells
mt_per_cell_m2 <- m2_diploid %>% 
  filter(SEGREGATION=="mt") %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
mt_per_cell_m2_upd <- m2_upd %>% 
  filter(STRAIN %in% c("het")) %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
mt_per_cell_m2_merge <- full_join(mt_per_cell_m2, mt_per_cell_m2_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>% 
  replace_na(list(n_m2=0, n_m2_upd=0)) %>%
  mutate(SUM_m2=n_m2+n_m2_upd)

all_m2_merge <- full_join(mt_per_cell_m2_merge, me_per_cell_m2_merge, by="CELL_ID", suffix=c("_mt","_me")) %>% 
  replace_na(list(n_m2_mt=0, n_m2_me=0, n_m2_upd_mt=0, n_m2_upd_me=0, SUM_m2_mt=0, SUM_m2_me=0)) %>%
  mutate(SUM_m2 = SUM_m2_mt+SUM_m2_me) %>% 
  mutate(rate_m2_me = SUM_m2_me/(SUM_m2_mt+SUM_m2_me)) %>% 
  mutate(bp_m2 = n_m2_mt + n_m2_me) %>%
  mutate(TYPE=case_when(SUM_m2_mt >= 15 ~ "Mitotic cell",
                        SUM_m2_me >= 15 ~ "Meiotic cell",
                        TRUE ~ "Others"))

p_obs <- all_m2_merge %>%
  rowwise() %>%
  mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
  mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
  mutate(SORT1=n_mt + n_na) %>%
  arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me)) %>%
  # arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
  theme_void() +
  ggtitle("Observed")

# simulate null distribution with p = 0.7606
set.seed(12345)
simulate_m2 <- list()
simulate_m2$n_me <- rbinom(length(unique(all_m2_merge$CELL_ID)), 19, 0.7606)
simulate_m2$n_mt <- 19 - simulate_m2$n_me
simulate_df <- as.data.frame(simulate_m2)
simulate_df$CELL_ID <- seq_len(nrow(simulate_df))

p_null_1 <- simulate_df %>%
  arrange(desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
  theme_void() +
  ggtitle("Null: p=0.7606")

# simulate null distribution with p = 0.5
set.seed(12345)
simulate_m2 <- list()
simulate_m2$n_me <- rbinom(length(unique(all_m2_merge$CELL_ID)), 19, 0.5)
simulate_m2$n_mt <- 19 - simulate_m2$n_me
simulate_df <- as.data.frame(simulate_m2)
simulate_df$CELL_ID <- seq_len(nrow(simulate_df))

p_null_2 <- simulate_df %>%
  arrange(desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
  theme_void() +
  ggtitle("Null: p=0.5")

p_obs

p_null_1

p_null_2

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "fig4D_4E.pdf"), width=10, height=4)
  print(p_obs)
  print(p_null_1)
  print(p_null_2)
  dev.off()
}
## quartz_off_screen 
##                 2

Calculate # of crossovers in meiotic and mitotic cells only

all_m2_merge_biased <- all_m2_merge %>%
  filter(TYPE %in% c("Mitotic cell","Meiotic cell"))

m2_spret_df <- m2_diploid %>%
  filter(CELL_ID %in% all_m2_merge_biased$CELL_ID)
m2_spret_upd <- m2_upd %>%
  filter(CELL_ID %in% all_m2_merge_biased$CELL_ID)

dim(m2_spret_df)
## [1] 4184    9
dim(m2_spret_upd)
## [1] 825   3
# compare crossover number between B6/Cast and B6/Spret
dim(hb_cast_df)
## [1] 60755     7
dim(m2_cast_df)
## [1] 2246    9
co_per_cell_hp_cast <- hb_cast_df %>% 
  group_by(CELL_ID) %>% 
  summarise(count = n())

co_per_cell_hp_spret <- hb_spret_df %>% 
  group_by(CELL_ID) %>% 
  summarise(count = n())

co_per_cell_m2_spret <- m2_spret_df %>%
  group_by(CELL_ID) %>%
  summarise(count = n())

co_per_cell_m2_cast <- m2_cast_df %>%
  group_by(CELL_ID) %>%
  summarise(count = n())

wilcox.test(co_per_cell_hp_cast$count, co_per_cell_hp_spret$count)$p.value
## [1] 6.716958e-26
wilcox.test(co_per_cell_m2_cast$count, co_per_cell_m2_spret$count)$p.value
## [1] 1.889733e-10
mean(co_per_cell_hp_cast$count)/19
## [1] 0.5764614
mean(co_per_cell_hp_spret$count)/19
## [1] 0.6203437
mean(co_per_cell_m2_cast$count)/19
## [1] 1.027918
mean(co_per_cell_m2_spret$count)/19
## [1] 0.9175439
remove(m2_spret_df) # will read this file in later but it is the same
remove(m2_spret_upd) # will read this file in later but it is the same

Fig 4E

p <- all_m2_merge %>%
  rowwise() %>%
  mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
  mutate(SORT1=n_m2_mt + n_m2_upd_mt) %>%
  mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
  arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me), n_m2_upd_me) %>%
  # arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c(2,3,5,6,7)) %>%
  mutate(Category=factor(Category, levels=c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_m2_me"="mistyrose", "n_m2_upd_me"="firebrick1", "n_m2_mt"="darkolivegreen1", "n_m2_upd_mt"="deepskyblue", "n_drop0"="black")) +
  theme_void() +
  ggtitle("fig 4E")

p

if (!is.na(fig_dir)) ggsave("fig4E.pdf", p, "pdf", fig_dir, width=10, height=4)

Fig 5

B6/Cast, same format as Fig 2

n_mt_cast <- read_tsv(slfile("cast/number_mt.cast.tab"))
n_class1 <- c(n_mt_cast$class1_1, n_mt_cast$class1_2) %>% na.omit() %>% as.vector()
n_class2 <- n_mt_cast$class2 %>% na.omit() %>% as.vector()
n_class3 <- n_mt_cast$class3 %>% na.omit() %>% as.vector()

Fig 5A, 5B

Fig.4 format, two figures, 1) binomial(19, 0.5); 2) class 2

p_obs <- tibble(CELL_ID=as.character(seq_len(length(n_class2))), n_mt=n_class2) %>%
  mutate(n_me=19 - n_mt) %>%
  arrange(desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
  theme_void() +
  ggtitle("Observed: class 2")

# simulate null distribution with p = 0.5
set.seed(12345)
p_null <- tibble(CELL_ID=as.character(seq_len(length(n_class2))), n_mt=rbinom(length(n_class2), 19, 0.5)) %>%
  mutate(n_me=19 - n_mt) %>%
  arrange(desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
  theme_void() +
  ggtitle("Null: p=0.5")

p_null

p_obs

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "fig5A_5B.pdf"), width=10, height=4)
  print(p_null)
  print(p_obs)
  dev.off()
}
## quartz_off_screen 
##                 2

Fig 5C, 5D, 5E

Fig.4 format, three figures, 1) class1_1 and class1_2 combined, 2) class1_1 only, 3) class1_1 only, break down pink and light green

# The hard way to do 2) class1_1 only since there's a column of mt in n_mt already
# But this is the way to do 3) class1_1 only, break down pink and light green
# count chromosomes per cell that are me or mt in m2 and hp separately
# me cells

me_per_cell_m2_cast <- m2_cast_df %>% 
  filter(SEGREGATION %in% c("me", "hp")) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID)
me_per_cell_m2_cast_upd <- m2_cast_upd %>% 
  filter(STRAIN %in% c("cast", "b6")) %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
drop0_per_cell <- m2_cast_upd %>%
  filter(STRAIN %in% c("drop0")) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID) %>%
  dplyr::rename(n_drop0=n)
me_per_cell_m2_merge_cast <- full_join(me_per_cell_m2_cast, me_per_cell_m2_cast_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
  full_join(drop0_per_cell, drop0_per_cell, by="CELL_ID") %>%
  replace_na(list(n_m2=0, n_m2_upd=0, n_drop0=0)) %>%
  mutate(SUM_m2=n_m2+n_m2_upd+n_drop0)

# mt cells
mt_per_cell_m2_cast <- m2_cast_df %>% 
  filter(SEGREGATION=="mt") %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
mt_per_cell_m2_cast_upd <- m2_cast_upd %>% 
  filter(STRAIN %in% c("het")) %>%
  distinct(CELL_ID, CHROM) %>% 
  count(CELL_ID)
mt_per_cell_m2_merge_cast <- full_join(mt_per_cell_m2_cast, mt_per_cell_m2_cast_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>% 
  replace_na(list(n_m2=0, n_m2_upd=0)) %>%
  mutate(SUM_m2=n_m2+n_m2_upd)

all_m2_merge_cast <- full_join(mt_per_cell_m2_merge_cast, me_per_cell_m2_merge_cast, by="CELL_ID", suffix=c("_mt","_me")) %>% 
  replace_na(list(n_m2_mt=0, n_m2_me=0, n_m2_upd_mt=0, n_m2_upd_me=0, SUM_m2_mt=0, SUM_m2_me=0)) %>%
  mutate(SUM_m2=SUM_m2_mt + SUM_m2_me) %>% 
  mutate(rate_m2_me=SUM_m2_me / (SUM_m2_mt + SUM_m2_me)) %>% 
  mutate(bp_m2=n_m2_mt + n_m2_me) %>%
  mutate(TYPE=case_when(SUM_m2_mt >= 15 ~ "Mitotic cell",
                        SUM_m2_me >= 15 ~ "Meiotic cell",
                        TRUE ~ "Others"))


p_3b2 <- all_m2_merge_cast %>%
  rowwise() %>%
  mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
  mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
  arrange(n_mt, desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
  theme_void()

p_3b3 <- all_m2_merge_cast %>%
  rowwise() %>%
  mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
  mutate(SORT1=n_m2_mt + n_m2_upd_mt) %>%
  mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
  arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me), n_m2_upd_me) %>%
  # arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_m2_me"="mistyrose", "n_m2_upd_me"="firebrick1", "n_m2_mt"="darkolivegreen1", "n_m2_upd_mt"="deepskyblue", "n_drop0"="black")) +
  theme_void()

# 1) class1_1 and class1_2 combined, blue and red like p_3b2
n_class1_1 <- n_mt_cast$class1_1 %>% na.omit() %>% as.vector()
n_class1_2 <- n_mt_cast$class1_2 %>% na.omit() %>% as.vector()

p_3b1 <- all_m2_merge_cast %>%
  rowwise() %>%
  mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
  mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
  # combine the class1_2 mt counts by myself from n_mt
  bind_rows(
    tibble(CELL_ID=as.character(seq_len(length(n_class1_2))), n_mt=n_class1_2) %>%
      mutate(n_me=19 - n_mt)) %>% 
  arrange(n_mt, desc(n_me)) %>%
  mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
  gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
  mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
  ggplot(aes(CELL_ID, Count, fill=Category)) +
  geom_bar(stat='identity', colour=NA, width=0.8) +
  scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
  theme_void()

p_3b1

p_3b2

p_3b3

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "fig5CDE.pdf"), 10, 4)
  print(p_3b1)
  print(p_3b2)
  print(p_3b3)
  dev.off()
}
## quartz_off_screen 
##                 2

Fig S3H-J

Number of mt versus fitted mixture model, 1) class2; 2) class1; 3) class3

Not run when knitting Rmd due to time.

We fit the data to a mixture of three binomial distributions parameterized by p_1, p_2, p_3, respectively, denoting their probabilities of chromosomes segregating equationally. The relative contribution of these three binomial distributions are denoted by a length 3 vector theta. We estimate p_1, p_2, p_3 as well as theta by drawing samples from their posterior distributions using Stan with uniform Dirichlet prior for theta: theta ~ Dir(K=3, alpha=1), and beta prior for p: p ~ Beta(a=5, b=5).

# try fitting mixture of binomials for number of events
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
set.seed(12345)
dat1 <- list(N=length(n_class1), y=n_class1, n_groups=3)
fit1 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat1, 
            iter = 1000, chains = 4)

dat2 <- list(N=length(n_class2), y=n_class2, n_groups=3)
fit2 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat2, 
            iter = 1000, chains = 4)

dat3 <- list(N=length(n_class3), y=n_class3, n_groups=3)
fit3 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat3, 
            iter = 1000, chains = 4)

# save(fit1, fit2, fit3, file="inst/extdata/mt_mixture_model_fits.rda")
print(fit1)
print(fit2)
print(fit3)

Continue and make plots for Fig S4A, S4B, S4C

##
par1 <- rstan::extract(fit1)
thetas <- colMeans(par1$Theta)
sizes <- round(100000 * thetas)

fit1_samples <- map_df(set_names(1:3), function(i) {
  tibble("Sample"=rbinom(sizes[i], 19, par1$p[, i]))
}, .id="Group")

p1_fitted <- ggplot(fit1_samples, aes(Sample, fill=Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
  scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
  xlab("# of equational chromosome segregations") +
  ylab("Estimated fraction\n") +
  theme_classic() +
  # theme(legend.justification = c(1, 1), legend.position = c(1, 1)) +
  NULL

p1_obs <- data.frame(n_mt=n_class1) %>%
  count(n_mt) %>%
  right_join(tibble(n_mt=0:19)) %>%
  replace_na(list(n=0)) %>%
  mutate(n_me = 19 - n_mt) %>%
  mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
                         n_me >= 15 ~ "Meiotic",
                         TRUE ~ "Others")) %>%
  # mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
  ggplot(aes(n_mt, n, fill=Group)) +
  geom_bar(stat="identity") +
  scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
  xlab("# of equational chromosome segregations") +
  ylab("# of cells\n") +
  theme_classic() +
  # theme(legend.justification = c(1, 1), legend.position = c(1, 1)) +
  NULL

##
par2 <- rstan::extract(fit2)
thetas <- colMeans(par2$Theta)
sizes <- round(100000 * thetas)

fit2_samples <- map_df(set_names(1:3), function(i) {
  tibble("Sample"=rbinom(sizes[i], 19, par2$p[, i]))
}, .id="Group")

p2_fitted <- ggplot(fit2_samples, aes(Sample, fill=Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
  scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
  xlab("# of equational chromosome segregations") +
  ylab("Estimated fraction\n") +
  theme_classic()

p2_obs <- data.frame(n_mt=n_class2) %>%
  count(n_mt) %>%
  right_join(tibble(n_mt=0:19)) %>%
  replace_na(list(n=0)) %>%
  mutate(n_me = 19 - n_mt) %>%
  mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
                         n_me >= 15 ~ "Meiotic",
                         TRUE ~ "Others")) %>%
  # mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
  ggplot(aes(n_mt, n, fill=Group)) +
  geom_bar(stat="identity") +
  scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
  xlab("# of equational chromosome segregations") +
  ylab("# of cells\n") +
  theme_classic()

##
par3 <- rstan::extract(fit3)
thetas <- colMeans(par3$Theta)
sizes <- round(100000 * thetas)

fit3_samples <- map_df(set_names(1:3), function(i) {
  tibble("Sample"=rbinom(sizes[i], 19, par3$p[, i]))
}, .id="Group")

p3_fitted <- ggplot(fit3_samples, aes(Sample, fill=Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
  scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
  xlab("# of equational chromosome segregations") +
  ylab("Estimated fraction\n") +
  theme_classic()

p3_obs <- data.frame(n_mt=n_class3) %>%
  count(n_mt) %>%
  right_join(tibble(n_mt=0:19)) %>%
  replace_na(list(n=0)) %>%
  mutate(n_me = 19 - n_mt) %>%
  mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
                         n_me >= 15 ~ "Meiotic",
                         TRUE ~ "Others")) %>%
  # mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
  ggplot(aes(n_mt, n, fill=Group)) +
  geom_bar(stat="identity") +
  scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
  xlab("# of equational chromosome segregations") +
  ylab("# of cells\n") +
  theme_classic()

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS4A.pdf"), 4, 5)
  grid::grid.draw(rbind(ggplotGrob(p1_fitted), ggplotGrob(p1_obs), size="last"))
  dev.off()
  pdf(file.path(fig_dir, "figS4B.pdf"), 4, 5)
  grid::grid.draw(rbind(ggplotGrob(p2_fitted), ggplotGrob(p2_obs), size="last"))
  dev.off()
  pdf(file.path(fig_dir, "figS4C.pdf"), 4, 5)
  grid::grid.draw(rbind(ggplotGrob(p3_fitted), ggplotGrob(p3_obs), size="last"))
  dev.off()
}

Meiotic crossover and uniparental chromosome distributions at the chromosome scale

mm10_fai_df <- read_tsv(slfile("mm10.fa.fai"), col_names=c("CHROM", "CHROM_SIZE", "BI", "BASES_PER_LINE", "BYTES_PER_LINE"))
m2_spret_df <- read_tsv(slfile("m2diploid.concensus.mb.spret.tab"))
m2_spret_upd <- read_tsv(slfile("m2.upd.spret.tab"))

Fig S4A

Number of haploid break points normalized by chromosome size ~ chromosome size

df <- hb_spret_df %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.6624745 
## p-value:
##  0.001997309
if (!is.na(fig_dir)) ggsave("figS5A.pdf", p, "pdf", fig_dir, width=3, height=3)

Fig S4B

Number of M2 diploid break points normalized by chromosome size ~ chromosome size

df <- m2_spret_df %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.8328771 
## p-value:
##  9.606707e-06
if (!is.na(fig_dir)) ggsave("figS5B.pdf", p, "pdf", fig_dir, width=3, height=3)

old Fig S6A (deleted)

Number of haploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size

df <- hb_spret_df %>%
  distinct(CELL_ID, CHROM) %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (alt_cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.8667679 
## p-value:
##  1.579717e-06
if (!is.na(fig_dir)) ggsave("figS6A.pdf", p, "pdf", fig_dir, width=3, height=3)

old Fig S6B (deleted)

Number of M2 diploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size

df <- m2_spret_df %>%
  distinct(CELL_ID, CHROM) %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (alt_cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.9079146 
## p-value:
##  7.905036e-08
if (!is.na(fig_dir)) ggsave("figS6B.pdf", p, "pdf", fig_dir, width=3, height=3)

CO count per chromosome within 1 cell

Fig S4C (and Fig S4E)

Need cast data in addition to spretus here

co_per_chr_hb_spret_df <- hb_spret_df %>%
  group_by(CELL_ID, CHROM) %>%
  summarise(N_BREAKS_CHROM=n()) %>%
  ungroup() %>%
  right_join(expand.grid(unique(hb_spret_df$CELL_ID), setdiff(unique(hb_spret_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
  replace_na(list(N_BREAKS_CHROM=0L))

co_per_chr_hb_cast_df <- hb_cast_df %>%
  group_by(CELL_ID, CHROM) %>%
  summarise(N_BREAKS_CHROM=n()) %>%
  ungroup() %>%
  right_join(expand.grid(unique(hb_cast_df$CELL_ID), setdiff(unique(hb_cast_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
  replace_na(list(N_BREAKS_CHROM=0L))

df <- count(co_per_chr_hb_spret_df, N_BREAKS_CHROM) %>%
  mutate(Strain="B6/Spret") %>%
  mutate(freq=n/sum(n)) %>%
  bind_rows(count(co_per_chr_hb_cast_df, N_BREAKS_CHROM) %>%
              mutate(Strain="B6/Cast") %>%
              mutate(freq=n/sum(n)))

p_freq <- ggplot(df, aes(N_BREAKS_CHROM, freq, group=Strain)) +
  geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
  scale_fill_manual(values=c("orangered", "coral")) +
  xlab("# of CO per chr") + 
  ylab("Frequency") +
  scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
  theme_classic()

p_n <- ggplot(df, aes(N_BREAKS_CHROM, n, group=Strain)) +
  geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
  scale_fill_manual(values=c("orangered", "coral")) +
  xlab("# of CO per chr") + 
  ylab("Count") +
  scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
  theme_classic()

p_freq

p_n

if (!is.na(fig_dir)) ggsave("figS5C.pdf", p_freq, "pdf", fig_dir, width=4, height=2.5)
if (!is.na(fig_dir)) ggsave("figS6C.pdf", p_n, "pdf", fig_dir, width=4, height=2.5)

Fig S4D (and Fig S4F)

co_per_chr_m2_spret_df <- m2_spret_df %>%
  group_by(CELL_ID, CHROM) %>%
  summarise(N_BREAKS_CHROM=n()) %>%
  ungroup() %>%
  right_join(expand.grid(unique(m2_spret_df$CELL_ID), setdiff(unique(m2_spret_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
  replace_na(list(N_BREAKS_CHROM=0L))

co_per_chr_m2_cast_df <- m2_cast_df %>%
  group_by(CELL_ID, CHROM) %>%
  summarise(N_BREAKS_CHROM=n()) %>%
  ungroup() %>%
  right_join(expand.grid(unique(m2_cast_df$CELL_ID), setdiff(unique(m2_cast_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
  replace_na(list(N_BREAKS_CHROM=0L))

df <- count(co_per_chr_m2_spret_df, N_BREAKS_CHROM) %>%
  mutate(Strain="B6/Spret") %>%
  mutate(freq=n/sum(n)) %>%
  bind_rows(count(co_per_chr_m2_cast_df, N_BREAKS_CHROM) %>%
              mutate(Strain="B6/Cast") %>%
              mutate(freq=n/sum(n)))

p_freq <- ggplot(df, aes(N_BREAKS_CHROM, freq, group=Strain)) +
  geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
  scale_fill_manual(values=c("dodgerblue", "deepskyblue")) +
  xlab("# of CO per chr") + 
  ylab("Frequency") +
  scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
  theme_classic()

p_n <- ggplot(df, aes(N_BREAKS_CHROM, n, group=Strain)) +
  geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
  scale_fill_manual(values=c("dodgerblue", "deepskyblue")) +
  xlab("# of CO per chr") + 
  ylab("Count") +
  scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
  theme_classic()

p_freq

p_n

if (!is.na(fig_dir)) ggsave("figS5D.pdf", p_freq, "pdf", fig_dir, width=4, height=2.5)
if (!is.na(fig_dir)) ggsave("figS6D.pdf", p_n, "pdf", fig_dir, width=4, height=2.5)

Uniparental chromosome distributions

Fig S4H

all_spret_upd <- read_tsv(slfile("all.upd.spret.tab"))

all_spret_upd_w_cellstatus <- all_spret_upd %>%
  mutate(CELL_STATUS=case_when(CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid", 
                               CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid", 
                               TRUE ~ "Other"))

# Uniparental chromosome dist for M2 diploid cells
m2_spret_hist_input <- m2_spret_upd %>% 
  group_by(CELL_ID) %>%
  summarise(count=sum(STRAIN %in% c("spret","b6"))) %>%
  group_by(count) %>%
  summarise(number=n())
m2_spret_hist_input[1,2] <- m2_spret_hist_input[1,2] + 41L 
# 41 cells with crossover on every chromosome (240 - 199)

# Uniparental chromosome dist for haploid cells
all_spret_upd_chr_per_cell_dist <- all_spret_upd %>%
  left_join(haploid_spret_upd, by=c("CELL_ID", "CHROM")) %>%
  mutate(CELL_STATUS=case_when(CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid",
                               CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid",
                               TRUE ~ "Other")) %>%
  mutate(UPD_STATUS=case_when(STRAIN %in% c("b6", "spret") ~ "UPD",
                              TRUE ~ "NOUPD")) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  group_by(CELL_ID, CELL_STATUS) %>%
  summarise(N_UPD_CHR=sum(UPD_STATUS == "UPD")) %>%
  group_by(CELL_STATUS, N_UPD_CHR) %>%
  summarise(N_CELL=n()) %>%
  mutate(N_UPD_CHR=as.integer(N_UPD_CHR), N_CELL=as.integer(N_CELL))

all_spret_upd_chr_per_cell_dist[18, 2:3] <- c(0L, 41L) # doesn't seem to matter because only haploid will be used from this table but anyways...

df <- m2_spret_hist_input %>%
  mutate(CELL_STATUS="M2 diploid") %>%
  dplyr::rename(N_CELL=number, N_UPD_CHR=count) %>%
  bind_rows(all_spret_upd_chr_per_cell_dist %>%
              filter(CELL_STATUS == "Haploid")) %>%
  bind_rows(all_spret_upd_w_cellstatus %>%
              filter(CELL_STATUS == "Other") %>%
              group_by(CELL_ID) %>%
              summarise(N_UPD_CHR=sum(UPD_STATUS != "OTHER")) %>%
              group_by(N_UPD_CHR) %>%
              summarise(N_CELL=n()) %>%
              mutate(CELL_STATUS="Other"))

p <- ggplot(df, aes(N_UPD_CHR, N_CELL)) +
  geom_bar(stat='identity') +
  xlab("# of uniparental chromosomes") +
  ylab("# of cells") +
  facet_wrap(~CELL_STATUS, scales='free_y') +
  theme_classic() +
  theme(strip.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS5F.pdf", p, "pdf", fig_dir, width=4, height=1.5)

Fig S4I

all_spret_upd_chr_dist_other <- all_spret_upd_w_cellstatus %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  filter(CELL_STATUS=="Other") %>%
  group_by(CELL_STATUS, CHROM) %>%
  summarise(N_CELL_WITH_UPD_CHR=sum(UPD_STATUS %in% c("REF", "ALT")))
all_spret_upd_chr_dist_hp <- haploid_spret_upd %>%
  mutate(CELL_STATUS="Haploid") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  group_by(CELL_STATUS, CHROM) %>%
  summarise(N_CELL_WITH_UPD_CHR=sum(STRAIN %in% c("spret", "b6")))
all_spret_upd_chr_dist_m2 <- m2_spret_upd %>%
  mutate(CELL_STATUS="M2 diploid") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  group_by(CELL_STATUS, CHROM) %>%
  summarise(N_CELL_WITH_UPD_CHR=sum(STRAIN %in% c("spret", "b6")))

all_spret_upd_chr_dist<-bind_rows(all_spret_upd_chr_dist_hp, all_spret_upd_chr_dist_m2, all_spret_upd_chr_dist_other) %>%
  mutate(CHROM=as.integer(gsub("chr", "", CHROM)))

p <- ggplot(all_spret_upd_chr_dist, aes(CHROM, N_CELL_WITH_UPD_CHR)) +
  geom_bar(stat='identity') +
  xlab("Chromosome") +
  ylab("# of cells") +
  scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
  facet_wrap(~CELL_STATUS, scales='free_y') +
  theme_classic() +
  theme(strip.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS5G.pdf", p, "pdf", fig_dir, width=4, height=1.5)

Distance between two crossovers co-occurring on the same chromosome

Fig S4G (also include all chr. not shown)

# sampled null tracts, use them to calculate null distribution of co distances
hb_spret_ctrl_df <- read_tsv(slfile("hb_sample.01.bed"), col_names=c("CHROM", "START", "END"))
m2_spret_ctrl_df <- read_tsv(slfile("m2-sampled-track_01.spret.tab"))
bp_spret_ctrl_df <- bind_rows(hb_spret_ctrl_df, m2_spret_ctrl_df)

# true distances
bp_spret_df <- bind_rows(hb_spret_df %>% mutate(TYPE="haploid"),
                         m2_spret_df %>% mutate(TYPE="m2 diploid"))

bp_spret_distance_df <- map_df(set_names(unique(bp_spret_df$TYPE)), function(type) {
  map_df(set_names(setdiff(unique(bp_spret_df$CHROM), c("chrX", "chrY"))), function(chr) {
    chr_df <- filter(bp_spret_df, CHROM == chr & TYPE == type)
    map_df(set_names(unique(chr_df$CELL_ID)), function(cell) {
      cc_df <- filter(chr_df, CELL_ID == cell)
      if (nrow(cc_df) < 2) return(NULL)
      dists <- c()
      for (i in seq_len(nrow(cc_df) - 1)) {
        j <- i + 1 # use distance of adjacent breakpoints only
        dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
      }
      return(tibble(DISTANCE=dists))
    }, .id="CELL_ID")
  }, .id="CHROM")
}, .id="TYPE")

bp_spret_distance_df_for_plot <- bp_spret_distance_df %>%
  mutate(CHROM="ALL") %>%
  bind_rows(bp_spret_distance_df)

bp_spret_distance_df_for_plot$CHROM <- factor(bp_spret_distance_df_for_plot$CHROM, levels=c(chr_levels, "ALL"))

sizes <- table(bp_spret_distance_df$CHROM)
bp_spret_ctrl_distance_df <- map_df(names(sizes), function(c) {
  choices <- bp_spret_ctrl_df %>%
    filter(CHROM == c)
  idx1 <- sample(seq_len(nrow(choices)))
  idx2 <- sample(seq_len(nrow(choices)))
  ret <- tibble()
  sampled_size <- 0
  while (sampled_size < sizes[c]) {
    sampled_choices <- bind_cols(choices[idx1, ], choices[idx2, ]) %>%
    filter((START < START1 & END < START1) | (START > END1 & START > START1)) %>%
    rowwise() %>%
    mutate(DISTANCE=bp_distance(START, END, START1, END1))
    ret <- distinct(bind_rows(ret, sampled_choices))
    ret <- filter(ret, DISTANCE > 1e5) # min_block size in call_state_block is 1e5 so
    sampled_size <- nrow(ret)
  }
  return(ret[1:sizes[c], ])
})

bp_spret_ctrl_distance_df_for_plot <- bind_rows(bp_spret_ctrl_distance_df %>% mutate(CHROM="ALL"),
                                                bp_spret_ctrl_distance_df)

p_4e <- bind_rows(bp_spret_distance_df_for_plot %>% mutate(TYPE=case_when(TYPE=="haploid" ~ "Haploid", TYPE=="m2 diploid" ~ "M2 diploid")),
               bp_spret_ctrl_distance_df_for_plot %>% mutate(TYPE="Expected")) %>%
  mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
  filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if fig 4e
  ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
  geom_histogram(position="dodge", bins=20) +
  scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
  facet_wrap(~CHROM, scales="free") +
  xlab("Distance between break points (M)") +
  ylab("Count") +
  theme_classic() +
  theme(strip.background = element_blank())

p_s5e <- bind_rows(bp_spret_distance_df_for_plot %>% mutate(TYPE=case_when(TYPE=="haploid" ~ "Haploid", TYPE=="m2 diploid" ~ "M2 diploid")),
               bp_spret_ctrl_distance_df_for_plot %>% mutate(TYPE="Expected")) %>%
  mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
  # filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if fig 4e
  ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
  geom_histogram(position="dodge", bins=20) +
  scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
  facet_wrap(~CHROM, scales="free") +
  xlab("Distance between break points (M)") +
  ylab("Count") +
  theme_classic() +
  theme(strip.background = element_blank())

p_4e

p_s5e

# observed distance mean and median, both haploid and m2 diploid combined
bp_spret_distance_df %>%
  summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
##   DISTANCE_MEAN DISTANCE_MEDIAN
##           <dbl>           <dbl>
## 1     80581864.       82033170.
# simulated null distance mean and median, both haploid and m2 diploid combined
bp_spret_ctrl_distance_df %>%
  summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
##   DISTANCE_MEAN DISTANCE_MEDIAN
##           <dbl>           <dbl>
## 1     47322986.        40138581
# wilcoxon test for distance
w <- wilcox.test(bp_spret_distance_df$DISTANCE, bp_spret_ctrl_distance_df$DISTANCE)
w$p.value
## [1] 3.774328e-271
if (!is.na(fig_dir)) ggsave("figS5E.pdf", p_4e, "pdf", fig_dir, width=5, height=4)
if (!is.na(fig_dir)) ggsave("figS6E.pdf", p_s5e, "pdf", fig_dir, width=8, height=4)

Proportion of mitochodria reads

Fig S4K

spret_all_cell_status <- read_tsv(slfile("all.cellstatus.spret.tab"), col_names=c("CELL_ID", "CELL_STATUS", "B", "C"))
spret_mt_prop_df <- read_tsv(slfile("all.MT.prop.summary.spret.tab"), col_names=c("CELL_ID", "PROP_MT"))

spret_mt_prop_cell_status <- left_join(spret_mt_prop_df, spret_all_cell_status, by="CELL_ID") %>%
  mutate(CELL_STATUS=case_when(CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid",
                               CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid",
                               TRUE ~ "Other"))

spret_mt_prop_m2d_status <- spret_mt_prop_cell_status %>%
  filter(CELL_STATUS == "M2 diploid") %>%
  mutate(CELL_STATUS=case_when(CELL_ID %in% filter(all_m2_merge, TYPE == "Meiotic cell")$CELL_ID ~ "M2 diploid (meiotic)",
                               CELL_ID %in% filter(all_m2_merge, TYPE == "Mitotic cell")$CELL_ID ~ "M2 diploid (mitotic)",
                               TRUE ~ "M2 diploid (other)"))

cell_status_levels <- c("Haploid", "M2 diploid", "Other", "M2 diploid (meiotic)", "M2 diploid (mitotic)", "M2 diploid (other)")

p <- ggplot(bind_rows(spret_mt_prop_cell_status, spret_mt_prop_m2d_status) %>% mutate(CELL_STATUS=factor(CELL_STATUS, levels=cell_status_levels)), aes(PROP_MT + 0.1)) +
  geom_histogram(bins=50) +
  scale_x_log10(limits=c(0.05, max(spret_mt_prop_cell_status$PROP_MT))) +
  facet_wrap(~CELL_STATUS, scales='free', nrow=2) +
  xlab("Proportion of mitochondria reads (normalized)") +
  ylab("Count") +
  theme_classic() +
  theme(strip.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS5I_S6G.pdf", p, "pdf", fig_dir, width=5, height=3.5)

Fig S4J

chromosome distribution for “reverse segregation” (for all TYPE=“Meiotic cell” defined in “all_m2_merge”, sum up SEGREGATION=“mt” in m2_spret_df and STRAIN=“het” in m2_spret_upd, aggregate by chromosome, generate a df similar to “all_upd_chr_dist_m2”, call it “meiotic_rev_chr_dist_m2_spret”); same for cast.

# for spret
rev_cell_spret_ids <- filter(all_m2_merge, TYPE == "Meiotic cell")$CELL_ID
meiotic_rev_chr_dist_m2_spret <- m2_spret_df %>%
  filter(SEGREGATION == "mt") %>%
  distinct(CELL_ID, CHROM) %>%
  bind_rows(m2_spret_upd %>%
  filter(STRAIN == "het")) %>%
  filter(CELL_ID %in% rev_cell_spret_ids) %>%
  count(CHROM) %>%
  mutate(CHROM=as.integer(gsub("chr", "", CHROM)))
  
p_spret <- ggplot(meiotic_rev_chr_dist_m2_spret, aes(CHROM, n)) +
  geom_bar(stat='identity') +
  xlab("Chromosome") +
  ylab("# of cells") +
  scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
  theme_classic() +
  theme(strip.background = element_blank())

# for cast
rev_cell_cast_ids <- filter(all_m2_merge_cast, TYPE == "Meiotic cell")$CELL_ID
meiotic_rev_chr_dist_m2_cast <- m2_cast_df %>%
  filter(SEGREGATION == "mt") %>%
  distinct(CELL_ID, CHROM) %>%
  bind_rows(m2_cast_upd %>%
  filter(STRAIN == "het")) %>%
  filter(CELL_ID %in% rev_cell_cast_ids) %>%
  count(CHROM) %>%
  mutate(CHROM=as.integer(gsub("chr", "", CHROM)))
  
p_cast <- ggplot(meiotic_rev_chr_dist_m2_cast, aes(CHROM, n)) +
  geom_bar(stat='identity') +
  xlab("Chromosome") +
  ylab("# of cells") +
  scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
  theme_classic() +
  theme(strip.background = element_blank())

p_spret

p_cast

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS5H.pdf"), 1.5, 1.5)
  print(p_spret)
  print(p_cast)
  dev.off()
}
## quartz_off_screen 
##                 2
# test if the number of chromosomes with rev seg per cell is higher in spret than cast
n_revseg_chr_per_cell_spret_df <- m2_spret_df %>%
  filter(SEGREGATION == "mt") %>%
  distinct(CELL_ID, CHROM) %>%
  bind_rows(m2_spret_upd %>%
  filter(STRAIN == "het")) %>%
  filter(CELL_ID %in% rev_cell_spret_ids) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID) %>%
  right_join(tibble(CELL_ID=rev_cell_spret_ids), by="CELL_ID") %>%
  replace_na(list(n=0))

n_revseg_chr_per_cell_cast_df <- m2_cast_df %>%
  filter(SEGREGATION == "mt") %>%
  distinct(CELL_ID, CHROM) %>%
  bind_rows(m2_cast_upd %>%
  filter(STRAIN == "het")) %>%
  filter(CELL_ID %in% rev_cell_cast_ids) %>%
  distinct(CELL_ID, CHROM) %>%
  count(CELL_ID) %>%
  right_join(tibble(CELL_ID=rev_cell_cast_ids), by="CELL_ID") %>%
  replace_na(list(n=0))

exactRankTests::wilcox.exact(n_revseg_chr_per_cell_cast_df$n, n_revseg_chr_per_cell_spret_df$n)
## 
##  Asymptotic Wilcoxon rank sum test
## 
## data:  n_revseg_chr_per_cell_cast_df$n and n_revseg_chr_per_cell_spret_df$n
## W = 5682.5, p-value = 1.621e-14
## alternative hypothesis: true mu is not equal to 0

Paski cells

lib202_upd <- read_tsv(slfile("lib202_upd.tab"))

Fig S4M

# draw this for ref, alt and other
df <- lib202_upd %>%
  count(CHROM, UPD_STATUS) %>%
  filter(!CHROM %in% c("chr4", "chrX", "chrY")) %>%
  mutate(CHROM=as.numeric(gsub("chr", "", CHROM))) %>%
  mutate(UPD_STATUS=case_when(UPD_STATUS == "REF" ~ "B6",
                              UPD_STATUS == "ALT" ~ "Spret",
                              UPD_STATUS == "OTHER" ~ "Other")) %>%
  filter(UPD_STATUS != "Other")

p <- bind_rows(df %>% mutate(UPD_STATUS="B6 + Spret"),
               df) %>%
  mutate(UPD_STATUS=factor(UPD_STATUS, levels=c("B6", "Spret", "B6 + Spret"))) %>%
  ggplot(aes(CHROM, n)) +
  geom_bar(stat='identity') +
  facet_wrap(~UPD_STATUS, scales="free_y") +
  xlab("Chromosome") +
  ylab("# of cells") +
  scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
  theme_classic() +
  theme(strip.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS6F1.pdf", p, "pdf", fig_dir, width=5, height=2)
# similar plot, separating mt, me, mt + me this time
df <- read_tsv(slfile("patski.mb.tab")) %>%
  count(CHROM, SEGREGATION) %>%
  filter(!CHROM %in% c("chr4", "chrX", "chrY")) %>%
  mutate(CHROM=as.numeric(gsub("chr", "", CHROM))) %>%
  mutate(SEGREGATION=case_when(SEGREGATION == "mt" ~ "Mitotic",
                               SEGREGATION == "me" ~ "Meiotic"))

p <- bind_rows(df %>% mutate(SEGREGATION="All LOH"),
               df) %>%
  mutate(SEGREGATION=factor(SEGREGATION, levels=c("Mitotic", "Meiotic", "All LOH"))) %>%
  ggplot(aes(CHROM, n)) +
  geom_bar(stat='identity') +
  facet_wrap(~SEGREGATION, scales="free_y") +
  xlab("Chromosome") +
  ylab("# of cells") +
  scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
  theme_classic() +
  theme(strip.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS6F2.pdf", p, "pdf", fig_dir, width=5, height=2)

B6/Cast, Chromosome distributions for meiotic crossover and UPD

old Fig S7A (deleted, coefficient included)

Number of haploid break points normalized by chromosome size ~ chromosome size, cast

df <- hb_cast_df %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.6511605 
## p-value:
##  0.002529338
if (!is.na(fig_dir)) ggsave("figS7A.pdf", p, "pdf", fig_dir, width=3, height=3)

old Fig S7B (deleted, coefficient included)

Number of M2 diploid break points normalized by chromosome size ~ chromosome size, cast

df <- m2_cast_df %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.8984839 
## p-value:
##  1.751928e-07
if (!is.na(fig_dir)) ggsave("figS7B.pdf", p, "pdf", fig_dir, width=3, height=3)

old Fig S7C (deleted)

Number of haploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size, cast

df <- hb_cast_df %>%
  distinct(CELL_ID, CHROM) %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(hb_cast_df$CELL_ID)) / CHROM_SIZE * 2 * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (alt_cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.8479662 
## p-value:
##  4.537226e-06
if (!is.na(fig_dir)) ggsave("figS7C.pdf", p, "pdf", fig_dir, width=3, height=3)

old Fig S7D (deleted)

Number of M2 diploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size, cast

df <- m2_cast_df %>%
  distinct(CELL_ID, CHROM) %>%
  count(CHROM) %>%
  left_join(mm10_fai_df, by="CHROM") %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
  mutate(CHROM=gsub("chr", "", CHROM)) %>%
  mutate(NORM_N=n / length(unique(m2_cast_df$CELL_ID)) / CHROM_SIZE * 100)

p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
  geom_point(alpha=0.7) +
  geom_text_repel(aes(label=CHROM)) +
  xlab("Chromosome size (Mb)") +
  ylab("Crossover density (alt_cM / Mb)") +
  theme_classic()

p

r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
##  -0.9393099 
## p-value:
##  2.546956e-09
if (!is.na(fig_dir)) ggsave("figS7D.pdf", p, "pdf", fig_dir, width=3, height=3)

Fig. S4G (Cast), Distance between two crossovers co-occurring on the same chromosome

# sampled null tracts, use them to calculate null distribution of co distances
ctrl_cast_df <- read_tsv(slfile("cast/sample.srt.uniq.bed"), col_names=c("CHROM", "START", "END", "TYPE")) %>%
  mutate(TYPE=case_when(TYPE=="hp" ~ "Haploid", TYPE=="m2" ~ "M2 diploid"))

# true distances
bp_cast_df <- bind_rows(hb_cast_df %>% mutate(TYPE="Haploid"),
                        m2_cast_df %>% mutate(TYPE="M2 diploid"))

lst <- list()

# for(type in unique(bp_cast_df$TYPE)) {
#   for(chr in setdiff(unique(bp_cast_df$CHROM), c("chrX", "chrY"))) {
#     chr_df <- filter(bp_cast_df, CHROM == chr & TYPE == type)
#     for(cell in unique(chr_df$CELL_ID)) {
#       cc_df <- filter(chr_df, CELL_ID == cell)
#       if (nrow(cc_df) < 2) next
#       dists <- c()
#       for (i in seq_len(nrow(cc_df) - 1)) {
#         j <- i + 1 # use distance of adjacent breakpoints only
#         dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
#       }
#       lst[[length(lst) + 1]] <- dists
#     }
#   }
# }

bp_cast_distance_df <- map_df(set_names(unique(bp_cast_df$TYPE)), function(type) {
  map_df(set_names(setdiff(unique(bp_cast_df$CHROM), c("chrX", "chrY"))), function(chr) {
    chr_df <- filter(bp_cast_df, CHROM == chr & TYPE == type)
    map_df(set_names(unique(chr_df$CELL_ID)), function(cell) {
      cc_df <- filter(chr_df, CELL_ID == cell)
      if (nrow(cc_df) < 2) return(NULL)
      dists <- c()
      for (i in seq_len(nrow(cc_df) - 1)) {
        j <- i + 1 # use distance of adjacent breakpoints only
        dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
      }
      return(tibble(DISTANCE=dists))
    }, .id="CELL_ID")
  }, .id="CHROM")
}, .id="TYPE")

bp_cast_distance_df_for_plot <- bp_cast_distance_df %>%
  mutate(CHROM="ALL") %>%
  bind_rows(bp_cast_distance_df)

bp_cast_distance_df_for_plot$CHROM <- factor(bp_cast_distance_df_for_plot$CHROM, levels=c(chr_levels, "ALL"))

sizes <- table(bp_cast_distance_df$CHROM)
bp_ctrl_distance_cast_df <- map_df(names(sizes), function(c) {
  choices <- ctrl_cast_df %>%
    filter(CHROM == c)
  idx1 <- sample(seq_len(nrow(choices)))
  idx2 <- sample(seq_len(nrow(choices)))
  ret <- tibble()
  sampled_size <- 0
  while (sampled_size < sizes[c]) {
    sampled_choices <- bind_cols(choices[idx1, ], choices[idx2, ]) %>%
    filter((START < START1 & END < START1) | (START > END1 & START > START1)) %>%
    rowwise() %>%
    mutate(DISTANCE=bp_distance(START, END, START1, END1))
    ret <- distinct(bind_rows(ret, sampled_choices))
    ret <- filter(ret, DISTANCE > 1e5) # min_block size in call_state_block is 1e5 so
    sampled_size <- nrow(ret)
  }
  return(ret[1:sizes[c], ])
})

bp_ctrl_distance_cast_df_for_plot <- bind_rows(bp_ctrl_distance_cast_df %>% mutate(CHROM="ALL"),
                                               bp_ctrl_distance_cast_df)

p <- bind_rows(bp_cast_distance_df_for_plot,
               bp_ctrl_distance_cast_df_for_plot %>% mutate(TYPE="Expected")) %>%
  mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
  # filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if plotting all chromosomes
  ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
  geom_histogram(position="dodge", bins=20) +
  scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
  facet_wrap(~CHROM, scales="free") +
  xlab("Distance between break points (M)") +
  ylab("Count") +
  theme_classic() +
  theme(strip.background = element_blank())

p

# if (!is.na(fig_dir)) ggsave("fig4E.pdf", p, "pdf", fig_dir, width=5, height=4) # if only plotting subset 
if (!is.na(fig_dir)) ggsave("figS7E.pdf", p, "pdf", fig_dir, width=8, height=4)

# observed distance mean and median, both haploid and m2 diploid combined
bp_cast_distance_df %>%
  summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
##   DISTANCE_MEAN DISTANCE_MEDIAN
##           <dbl>           <int>
## 1     97212487.        97127834
# simulated null distance mean and median, both haploid and m2 diploid combined
bp_ctrl_distance_cast_df %>%
  summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
##   DISTANCE_MEAN DISTANCE_MEDIAN
##           <dbl>           <int>
## 1     49953736.        43243594
# wilcoxon test for distance
w <- wilcox.test(bp_cast_distance_df$DISTANCE, bp_ctrl_distance_cast_df$DISTANCE)
w$p.value
## [1] 0

Fig S4G (compare Spret and Cast)

Compare distances between breaks in B6/Spret and B6/Cast.

w <- wilcox.test(bp_spret_distance_df$DISTANCE, bp_cast_distance_df$DISTANCE)
w$p.value
## [1] 1.08051e-113
p <- bind_rows(bp_spret_distance_df %>% select(DISTANCE) %>% mutate(Strain="B6/Spret"),
          bp_cast_distance_df %>% select(DISTANCE) %>% mutate(Strain="B6/Cast")) %>%
  mutate(`Distance (Mb)`=DISTANCE / 1e6) %>%
  ggplot(aes(Strain, `Distance (Mb)`)) +
  geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
  theme_bw() +
  scale_y_continuous(labels=scales::format_format(scientific = FALSE),
                     breaks=scales::pretty_breaks(n=4)) +
  theme(legend.position='none',
        axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())

p

if (!is.na(fig_dir)) ggsave("figS5E_2.pdf", p, "pdf", fig_dir, height=2.5, width=2)

Karyotype plots (deleted)

old Fig S8A (deleted)

bp_spret <- makeGRangesFromDataFrame(data.frame(
  bind_rows(hb_spret_df, m2_spret_df) %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_cast <- makeGRangesFromDataFrame(data.frame(
  bind_rows(hb_cast_df, m2_cast_df) %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_hap <- makeGRangesFromDataFrame(data.frame(
  hb_spret_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_m2 <- makeGRangesFromDataFrame(data.frame(
  m2_spret_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))

ssds_b6_df <- read_tsv(slfile("ssds_b6.tab"), col_names=c("chr", "start", "end", "signal"))
ssds_cast_df <- read_tsv(slfile("ssds_cast.tab"), col_names=c("chr", "start", "end", "signal"))
ssds_b6_cast_df <- read_tsv(slfile("ssds_b6_cast.tab"), col_names=c("chr", "start", "end", "signal"))

ssds_b6 <- makeGRangesFromDataFrame(data.frame(
  ssds_b6_df %>% dplyr::select(chr, start, end)))
ssds_cast <- makeGRangesFromDataFrame(data.frame(
  ssds_cast_df %>% dplyr::select(chr, start, end)))
ssds_b6_cast <- makeGRangesFromDataFrame(data.frame(
  ssds_b6_cast_df %>% dplyr::select(chr, start, end)))

kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast, col="green", r0=0, r1=0.19, free.y=TRUE)
kpPlotCoverage(kp, bp_spret, col="purple", r0=0.2, r1=0.39, free.y=TRUE)
kpPlotDensity(kp, ssds_b6_cast, col="grey10", r0=0.4, r1=0.59)
kpPlotDensity(kp, ssds_cast, col="orangered", r0=0.6, r1=0.79)
kpPlotDensity(kp, ssds_b6, col="dodgerblue", r0=0.8, r1=0.99)

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS8A.pdf"), 8, 20)
  kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
  kpPlotCoverage(kp, bp_cast, col="green", r0=0, r1=0.19, free.y=TRUE)
  kpPlotCoverage(kp, bp_spret, col="purple", r0=0.2, r1=0.39, free.y=TRUE)
  kpPlotDensity(kp, ssds_b6_cast, col="grey10", r0=0.4, r1=0.59)
  kpPlotDensity(kp, ssds_cast, col="orangered", r0=0.6, r1=0.79)
  kpPlotDensity(kp, ssds_b6, col="dodgerblue", r0=0.8, r1=0.99)
  dev.off()
}
## quartz_off_screen 
##                 2

old Fig S8B (deleted)

same as Fig. S8A, but with tracts for SSDS_F1, B6/Cast haploid, B6/Cast M2 diploid

ssds_f1 <- makeGRangesFromDataFrame(data.frame(
  read_tsv(slfile("ssds_b6_cast.tab"), col_names=c("chr", "start", "end", "signal")) %>% distinct()))
bp_cast_hap <- makeGRangesFromDataFrame(data.frame(
  hb_cast_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_cast_m2 <- makeGRangesFromDataFrame(data.frame(
  m2_cast_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))

kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast_m2, col="green", r0=0, r1=0.33, free.y=TRUE)
kpPlotCoverage(kp, bp_cast_hap, col="purple", r0=0.34, r1=0.66, free.y=TRUE)
kpPlotDensity(kp, ssds_f1, col="grey10", r0=0.67, r1=0.99)

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS8B.pdf"), 8, 20)
  kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
  kpPlotCoverage(kp, bp_cast_m2, col="green", r0=0, r1=0.33, free.y=TRUE)
  kpPlotCoverage(kp, bp_cast_hap, col="purple", r0=0.34, r1=0.66, free.y=TRUE)
  kpPlotDensity(kp, ssds_f1, col="grey10", r0=0.67, r1=0.99)
  dev.off()
}
## quartz_off_screen 
##                 2

Fig. S8C

same as Fig. S8A, but with tracts for Spo11_both (dodgerblue), Spo11_with all hits(orangered), B6/Spret haploid(purple), B6/Spret M2 diploid (green)

spo11_both <- makeGRangesFromDataFrame(data.frame(
  read_tsv(slfile("spo11_b6_and_spret.tab")) %>%
    distinct() %>%
    dplyr::rename(chr=CHROM, start=START, end=END, signal=SPO11_BOTH)))
spo11_all_hits <- makeGRangesFromDataFrame(data.frame(
  read_tsv(slfile("B6.Spo11.withHits.txt")) %>%
    dplyr::select(chr, Start, End, Hits) %>%
    distinct() %>%
    dplyr::rename(start=Start, end=End, signal=Hits)
))
bp_spret_hap <- makeGRangesFromDataFrame(data.frame(
  hb_spret_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_m2 <- makeGRangesFromDataFrame(data.frame(
  m2_spret_df %>%
    dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))

kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_spret_m2, col="green", r0=0, r1=0.24, free.y=TRUE)
kpPlotCoverage(kp, bp_spret_hap, col="purple", r0=0.25, r1=0.49, free.y=TRUE)
kpPlotDensity(kp, spo11_all_hits, col="orangered", r0=0.50, r1=0.74)
kpPlotDensity(kp, spo11_both, col="dodgerblue", r0=0.75, r1=0.99)

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS8C.pdf"), 8, 20)
  kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
  kpPlotCoverage(kp, bp_spret_m2, col="green", r0=0, r1=0.24, free.y=TRUE)
  kpPlotCoverage(kp, bp_spret_hap, col="purple", r0=0.25, r1=0.49, free.y=TRUE)
  kpPlotDensity(kp, spo11_all_hits, col="orangered", r0=0.50, r1=0.74)
  kpPlotDensity(kp, spo11_both, col="dodgerblue", r0=0.75, r1=0.99)
  dev.off()
}
## quartz_off_screen 
##                 2

Fig 6C, 6D, S7

B6/Spret, M2 diploid, all_m2_merge, 38 TYPE=Mitotic cells, 202 TYPE=Meiotic cell 1. df1: take each chromosome, only keep the coordinate closest to CEN (smallest coordinate), ignore other crossovers on the same chromosome, plot along chr. coordinate, one color for mt, one color for me, do this for each of the 19 chr. 2. df2: same as above, only keep the coordinate farthest from CEN 3. df3: aggregate all chromosome, make one plot, normalize by chr. size, i.e., instead of showing chr. coordinate on the X axis for each chr., only make one plot, X axis from 0 (left end of chr) to 1 (right end of chr). Do the same thing for B6/Cast, M2 diploid, all_m2_merge_cast, 8 TYPE=Mitotic cells, 107 TYPE=Meiotic cell

m2_spret_l_r <- m2_spret_df %>%
  right_join(filter(all_m2_merge, TYPE %in% c("Meiotic cell", "Mitotic cell")), by="CELL_ID") %>%
  mutate(TYPE=case_when(TYPE == "Meiotic cell" ~ "Reductionally segregating cells",
                        TYPE == "Mitotic cell" ~ "Equationally segregating cells")) %>%
  mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
  group_by(CELL_ID, CHROM, TYPE) %>%
  summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
  ungroup() %>%
  mutate(CHROM=factor(CHROM, levels=chr_levels))

m2_cast_l_r <- m2_cast_df %>%
  right_join(filter(all_m2_merge_cast, TYPE %in% c("Meiotic cell", "Mitotic cell")), by="CELL_ID") %>%
  mutate(TYPE=case_when(TYPE == "Meiotic cell" ~ "Reductionally segregating cells",
                        TYPE == "Mitotic cell" ~ "Equationally segregating cells")) %>%
  mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
  group_by(CELL_ID, CHROM, TYPE) %>%
  summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
  ungroup() %>%
  mutate(CHROM=factor(CHROM, levels=chr_levels))

hb_spret_l_r <- hb_spret_df %>%
  filter(CHROM %in% chr_levels) %>%
  mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
  group_by(CELL_ID, CHROM) %>%
  summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
  ungroup() %>%
  mutate(CHROM=factor(CHROM, levels=chr_levels))

hb_cast_l_r <- hb_cast_df %>%
  filter(CHROM %in% chr_levels) %>%
  mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
  group_by(CELL_ID, CHROM) %>%
  summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
  ungroup() %>%
  mutate(CHROM=factor(CHROM, levels=chr_levels))

this_theme <- theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.ticks.y = element_blank(),
        axis.text.y = element_blank(), axis.title.y = element_blank(),
        axis.line.y=element_blank(),
        axis.line.x=element_line(colour="black"),
        strip.text.y=element_text(angle=180),
        strip.background=element_rect(fill=NA))

trim <- TRUE

# p_spret_m2_left <- ggplot(m2_spret_l_r) +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.2) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("B6/Spret, left most break") +
#   this_theme

# md_spret_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM), data=m2_spret_l_r)

p_spret_m2_right <- ggplot(m2_spret_l_r) +
  geom_vline(aes(xintercept=MAX_MID), alpha=0.2) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  ggtitle("B6/Spret, right most break") +
  this_theme

md_spret_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM), data=m2_spret_l_r)

# p_cast_m2_left <- ggplot(m2_cast_l_r) +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.2) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("B6/Cast, left most break") +
# this_theme
# 
# md_cast_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM), data=m2_cast_l_r)

p_cast_m2_right <- ggplot(m2_cast_l_r) +
  geom_vline(aes(xintercept=MAX_MID), alpha=0.2) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  ggtitle("B6/Cast, right most break") +
this_theme

md_cast_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM), data=m2_cast_l_r)

# p_hb_spret_cast_left <- bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
#                                   hb_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
#   ggplot() +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("Haploid cells, left most break") +
#   this_theme
# 
# md_hb_spret_cast_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
#                                         data=bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
#                                                        hb_cast_l_r %>% mutate(TYPE="B6/Cast")))
# 

# S10A
p_hb_spret_cast_right <- bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
                                   hb_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
  ggplot() +
  geom_vline(aes(xintercept=MAX_MID), alpha=0.02) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  ggtitle("Haploid cells, right most break") +
  this_theme

md_hb_spret_cast_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
                                         data=bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
                                                        hb_cast_l_r %>% mutate(TYPE="B6/Cast")))
# 
# p_m2_spret_cast_left <- bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
#                                   m2_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
#   ggplot() +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.1) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("M2 diploid cells, left most break") +
#   this_theme
# 
# md_m2_spret_cast_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
#                                         data=bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
#                                                        m2_cast_l_r %>% mutate(TYPE="B6/Cast")))
# 

# 5C
p_m2_spret_cast_right <- bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
                                   m2_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
  ggplot() +
  geom_vline(aes(xintercept=MAX_MID), alpha=0.1) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  ggtitle("M2 diploid cells, right most break") +
  this_theme

md_m2_spret_cast_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
                                         data=bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
                                                        m2_cast_l_r %>% mutate(TYPE="B6/Cast")))
# 
# p_spret_hb_m2_left <- bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
#                                 m2_spret_l_r %>% mutate(TYPE="M2 diploid cells")) %>%
#   ggplot() +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("B6/Spret, left most break") +
#   this_theme
# 
# md_spret_hb_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
#                                       data=bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
#                                                      m2_spret_l_r %>% mutate(TYPE="M2 diploid cells")))
# 

# 5D
p_spret_hb_m2_right <- bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
                                 m2_spret_l_r %>% mutate(TYPE="M2 cells")) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  ggplot() +
  geom_vline(aes(xintercept=MAX_MID, alpha=TYPE)) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  scale_alpha_manual(guide=FALSE, values=c("Haploid cells"=0.02, "M2 cells"=0.1)) +
  ggtitle("B6/Spret, right most break") +
  this_theme

md_spret_hb_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
                                       data=bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
                                                      m2_spret_l_r %>% mutate(TYPE="M2 cells")))
# 
# p_cast_hb_m2_left <- bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
#                                m2_cast_l_r %>% mutate(TYPE="M2 diploid cells")) %>%
#   ggplot() +
#   geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
#   stat_density(aes(MIN_MID), color="orangered", geom = "line") +
#   facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
#   xlab("Chromosome coordinate (Mb)") +
#   xlim(c(0, 200)) +
#   ggtitle("B6/Cast, left most break") +
#   this_theme
# 
# md_cast_hb_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
#                                      data=bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
#                                                     m2_cast_l_r %>% mutate(TYPE="M2 diploid cells")))
#

# S10B
p_cast_hb_m2_right <- bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
                                m2_cast_l_r %>% mutate(TYPE="M2 cells")) %>%
  filter(!CHROM %in% c("chrX", "chrY")) %>%
  ggplot() +
  geom_vline(aes(xintercept=MAX_MID, alpha=TYPE)) +
  stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
  facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
  xlab("Chromosome coordinate (Mb)") +
  xlim(c(0, 200)) +
  scale_alpha_manual(guide=FALSE, values=c("Haploid cells"=0.01, "M2 cells"=0.2)) +
  ggtitle("B6/Cast, right most break") +
  this_theme

md_cast_hb_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
                                      data=bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
                                                     m2_cast_l_r %>% mutate(TYPE="M2 cells")))

p_m2_spret_cast_right

p_spret_hb_m2_right

p_hb_spret_cast_right

p_cast_hb_m2_right

p_cast_m2_right

p_spret_m2_right

library(gridExtra)
if (!is.na(fig_dir)) {

  pdf(file.path(fig_dir, "fig6CD.pdf"), 6.5, 5.5)
  print(p_m2_spret_cast_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_m2_spret_cast_right)$coefficients)[, c(1,5)])

  print(p_spret_hb_m2_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_spret_hb_m2_right)$coefficients)[, c(1,5)])
  dev.off()
  
  pdf(file.path(fig_dir, "figS12A.pdf"), 6.5, 5.5)
  print(p_hb_spret_cast_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_hb_spret_cast_right)$coefficients)[, c(1,5)])
  dev.off()
  
  pdf(file.path(fig_dir, "figS12B.pdf"), 6.5, 5.5)
  print(p_cast_hb_m2_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_cast_hb_m2_right)$coefficients)[, c(1,5)])
  dev.off()
  
  pdf(file.path(fig_dir, "figS12C.pdf"), 6.5, 5.5)
  print(p_cast_m2_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_cast_m2_right)$coefficients)[, c(1,5)])
  dev.off()

  pdf(file.path(fig_dir, "figS12D.pdf"), 6.5, 5.5)
  print(p_spret_m2_right)
  grid::grid.newpage()
  grid.table(as.data.frame(summary(md_spret_m2_right)$coefficients)[, c(1,5)])
  dev.off()
}
## quartz_off_screen 
##                 2

Break features

Fig 6B, Break size distribution

p1 <- ggplot(bind_rows(hb_spret_df, m2_spret_df) %>% mutate(DISTANCE=DISTANCE / 1e3), aes(DISTANCE)) +
  geom_histogram(binwidth=0.1) +
  scale_x_log10(labels=scales::format_format(scientific = FALSE, digits=0)) +
  xlab("Break point size (kb)") +
  ylab("Count") +
  theme_classic() +
  theme(axis.text = element_text(hjust = 1)) +
  ggtitle("B6/Spret")

p2 <- ggplot(bind_rows(hb_cast_df, m2_cast_df) %>% mutate(DISTANCE=DISTANCE / 1e3), aes(DISTANCE)) +
  geom_histogram(binwidth=0.1) +
  scale_x_log10(labels=scales::format_format(scientific = FALSE, digits=0)) +
  xlab("Break point size (kb)") +
  ylab("Count") +
  theme_classic() +
  theme(axis.text = element_text(hjust = 1)) +
  ggtitle("B6/Cast")

p1

p2

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "fig6B.pdf"), 2.5, 2.5)
  p1
  p2
  dev.off()
}
## quartz_off_screen 
##                 2

Load pre-computed features

load(slfile("features.rda"))

BAS

Fig 6A, B6/Spret + B6/Cast combined data

spret_cast_window_data_df <- as.data.frame(spret_cast_window_data)
spret_cast_window_data_df <- spret_cast_window_data_df[!grepl("^event_", colnames(spret_cast_window_data_df)) | colnames(spret_cast_window_data_df) == "event_cast_spret"]

spret_cast_window_lm <- lm(log1p(event_cast_spret) ~ ., data=spret_cast_window_data_df)

if (!is.na(fig_dir)) {
  sink(file.path(fig_dir, "spret_cast_window_lm_output.txt"))
  print(summary(spret_cast_window_lm))
  sink()
}

set.seed(12345)
spret_cast_window_bas <- bas.lm(log1p(event_cast_spret) ~ ., data=spret_cast_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(spret_cast_window_bas)

if (!is.na(fig_dir)) {
  ggsave(file.path(fig_dir, "fig6A.pdf"), p, "pdf", width=7, height=12)
  sink(file.path(fig_dir, "bas_spret_cast_coef.txt"))
  print(coef(spret_cast_window_bas))
  sink()
}

Fig S5A, Cast

cast_window_data_df <- as.data.frame(cast_window_data)
cast_window_data_df <- cast_window_data_df[!grepl("^event_", colnames(cast_window_data_df)) | colnames(cast_window_data_df) == "event_hp_m2"]

cast_window_lm <- lm(log1p(event_hp_m2) ~ ., data=cast_window_data_df)

if (!is.na(fig_dir)) {
  sink(file.path(fig_dir, "cast_window_lm_output.txt"))
  print(summary(cast_window_lm))
  sink()
}

set.seed(12345)
cast_window_bas <- bas.lm(log1p(event_hp_m2) ~ ., data=cast_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(cast_window_bas)

if (!is.na(fig_dir)) {
  ggsave(file.path(fig_dir, "figS9A.pdf"), p, "pdf", width=8, height=9)
  sink(file.path(fig_dir, "bas_cast_coef.txt"))
  print(coef(cast_window_bas))
  sink()
}

Fig S5B, Spret

spret_window_data_df <- as.data.frame(spret_window_data)
spret_window_data_df <- spret_window_data_df[!grepl("^event_", colnames(spret_window_data_df)) | colnames(spret_window_data_df) == "event_hp_m2"]

spret_window_lm <- lm(log1p(event_hp_m2) ~ ., data=spret_window_data_df)

if (!is.na(fig_dir)) {
  sink(file.path(fig_dir, "spret_window_lm_output.txt"))
  print(summary(spret_window_lm))
  sink()
}

set.seed(12345)
spret_window_bas <- bas.lm(log1p(event_hp_m2) ~ ., data=spret_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(spret_window_bas)

if (!is.na(fig_dir)) {
  ggsave(file.path(fig_dir, "figS9B.pdf"), p, "pdf", width=8, height=9)
  sink(file.path(fig_dir, "bas_spret_coef.txt"))
  print(coef(spret_window_bas))
  sink()
}

Correlation among features

Fig S6A, Spret

window_spret_features_for_cor <- window_spret_with_cor %>%
  dplyr::select(-c(CHROM, START, END, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75)) %>%
  mutate_if(sapply(., is.character), function(x) {ifelse(x == "Yes", 1, 0)})

window_spret_features_cor <- cor(window_spret_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_spret_features_cor)
## [1] 80 80
window_spret_features_order <- corrplot::corrMatOrder(window_spret_features_cor[1:75, 1:75], order = "hclust", hclust.method = "complete")
window_spret_features_order <- c(c(76:80), window_spret_features_order) # put responses on topleft

corrplot::corrplot(window_spret_features_cor[window_spret_features_order, window_spret_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 75)))

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS10.pdf"), 10, 10)
  corrplot::corrplot(window_spret_features_cor[window_spret_features_order, window_spret_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 75)))
  dev.off()
}
## quartz_off_screen 
##                 2

Fig S6B, Cast

window_cast_features_for_cor <- window_cast_with_cor %>%
  select(-c(CHROM, START, END, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75)) %>%
  mutate_if(sapply(., is.character), function(x) {ifelse(x == "Yes", 1, 0)})

window_cast_features_cor <- cor(window_cast_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_cast_features_cor)
## [1] 73 73
window_cast_features_order <- corrplot::corrMatOrder(window_cast_features_cor[1:68, 1:68], order = "hclust", hclust.method = "complete")
window_cast_features_order <- c(c(69:73), window_cast_features_order)

corrplot::corrplot(window_cast_features_cor[window_cast_features_order, window_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 68)))

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "figS11.pdf"), 10, 10)
  corrplot::corrplot(window_cast_features_cor[window_cast_features_order, window_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 68)))
  dev.off()
}
## quartz_off_screen 
##                 2

Not included in manuscript, Spret + Cast

cast_spret_combined_events <- list(
  event_cast_spret = read_tsv(slfile("event_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_cast_spret")),
    event_hp_cast_spret = read_tsv(slfile("haploid_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_hp_cast_spret")),
    event_m2_cast_spret = read_tsv(slfile("m2_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_m2_cast_spret")),
    event_mt_cast_spret = read_tsv(slfile("mt_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_mt_cast_spret")),
    event_me_cast_spret = read_tsv(slfile("me_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_me_cast_spret")))

cast_spret_combined_events_df <- bind_cols(map(cast_spret_combined_events, ~.x[, 4]))

window_spret_cast_features_for_cor <- bind_cols(
  window_spret_features_for_cor %>% dplyr::select(-starts_with("event_")),
  window_cast_features_for_cor[setdiff(colnames(window_cast_features_for_cor), colnames(window_spret_features_for_cor))],
  cast_spret_combined_events_df)

window_spret_cast_features_cor <- cor(window_spret_cast_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_spret_cast_features_cor)
## [1] 86 86
window_spret_cast_features_order <- corrplot::corrMatOrder(window_spret_cast_features_cor[1:81, 1:81], order = "hclust", hclust.method = "complete")
window_spret_cast_features_order <- c(c(82:86), window_spret_cast_features_order)
corrplot::corrplot(window_spret_cast_features_cor[window_spret_cast_features_order, window_spret_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 81)))

if (FALSE) {
  pdf(file.path(fig_dir, "figSSS.pdf"), 10, 10)
  corrplot::corrplot(window_spret_cast_features_cor[window_spret_cast_features_order, window_spret_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 81)))
  dev.off()
}

PCA

plot_cell_pca_simp <- function(cell_pca, cell_df, x="PC1", y="PC2") {
  s <- summary(cell_pca)
  s1 <- s$importance["Proportion of Variance", x] * 100
  s2 <- s$importance["Proportion of Variance", y] * 100
  cell_pca_df <- cell_df %>%
    mutate(PCX=cell_pca$x[, x],
           PCY=cell_pca$x[, y]) %>%
    mutate(TYPE=R.utils::capitalize(TYPE))
  
  p <- ggplot(cell_pca_df, aes(PCX, PCY)) +
      geom_point(aes(color=TYPE), alpha=0.4) +
      scale_color_manual(values=c("Haploid"="dodgerblue", "M2 diploid"="orangered"), name="") +
      xlab(sprintf("%s (%1.2f%%)", x, s1)) +
      ylab(sprintf("%s (%1.2f%%)", y, s2)) +
      theme_classic()
  return(p)
}

plot_cell_pca <- function(cell_pca, cell_df, x="PC1", y="PC2", cols=c()) {
  require(viridis)
  require(gridExtra)
  s <- summary(cell_pca)
  s1 <- s$importance["Proportion of Variance", x] * 100
  s2 <- s$importance["Proportion of Variance", y] * 100
  cell_pca_df <- cell_df %>%
    mutate(PCX=cell_pca$x[, x],
           PCY=cell_pca$x[, y])
  if (length(cols) == 0) cols <- setdiff(names(cell_pca_df)[-1], c("PCX", "PCY"))
  plts <- map(cols, function(c) {
    if (is.numeric(cell_pca_df[[c]])) {
      this_df <- cell_pca_df
      this_df[[c]] <- log1p(cell_pca_df[[c]])
    } else {
      this_df <- cell_pca_df
    }
    ggplot(this_df, aes(PCX, PCY)) +
      geom_point(aes_string(color=c), alpha=0.6) +
      scale_color_viridis(name=gsub("_upd", "_upc", c), discrete=!is.numeric(this_df[[c]])) +
      # last minute change" chr3_upd to chr3_upc because it apparently confused people
      xlab(sprintf("%s (%1.2f%%)", x, s1)) +
      ylab(sprintf("%s (%1.2f%%)", y, s2)) +
      theme_classic() +
      theme(legend.position="bottom",
            legend.title=element_text(size=25),
            legend.text=element_text(size=12),
            axis.title=element_text(size=25))
  })
  return(plts)
}

Fig S7F, Spret

Figure too big so not embedded in html, see png.

spret_cell_pca <- prcomp((spret_cell_df_with_cor[-(1:2)]), scale.=T)

spret_sub_cols <- c("TYPE","chr3_bp","chr1_upd","TELOMERIC","QUANTILE_0_25_FLAG","SPO11_BOTH","SPO11_B6","SPO11_SPRET","SPO11_OTHER","CNV_SPRET_LOSS","CTCF","RNA_POL2","H3K27AC","H3K36ME3","H3K4ME3","LONG_RNA_SEQ","DMRT6","H3K27ME3","PS_ATAC","PS_H3K27AC","PS_COHESIN","PS_SMC1B_LAP","H4K5AC","H4K5BU","H4K8AC","H4K8BU","CTCFL","CTCF_B6","CTCF_SPRET","END_SEQ_B6_ETO","END_SEQ_SPRET_ETO","RAD21_B6","RAD21_SPRET","TOP2A_MEF","TOP2B_MEF","PATSKI_ALLELIC_ATAC_B6","PATSKI_ALLELIC_ATAC_SPRET","PATSKI_ALLELIC_CTCF_B6","PATSKI_ALLELIC_CTCF_SPRET","P7GONIA","GENE","LNC_RNA","KAC","N_MT","N_ME")

if (!is.na(fig_dir)) {
  png(file.path(fig_dir, "figS13.png"), 8000, 6000, res=200)
  do.call("grid.arrange", c(plot_cell_pca(spret_cell_pca, spret_cell_df_with_cor, cols=spret_sub_cols), ncol=8))
  dev.off()
}
## quartz_off_screen 
##                 2

Fig S7G, Cast

Figure too big so not embedded in html, see png.

cast_cell_pca <- prcomp((cast_cell_df_with_cor[-(1:2)]), scale.=T)

cast_sub_cols <- c("TYPE", "chr3_bp","chr1_upd","QUANTILE_25_75_FLAG","CTCF","RNA_POL2","H3K27AC","H3K36ME3","DMRT6","H3K27ME3","PS_H3K27AC","PS_COHESIN","PS_SMC1B_LAP","CTCFL","END_SEQ_B6_ETO","RAD21_B6","PATSKI_ALLELIC_ATAC_B6","PATSKI_ALLELIC_CTCF_B6","CNV_CAST_LOSS","KAC")

if (!is.na(fig_dir)) {
  png(file.path(fig_dir, "figS14.png"), 6000, 4000, res=200)
  do.call("grid.arrange", c(plot_cell_pca(cast_cell_pca, cast_cell_df_with_cor, "PC1", "PC3", cols=cast_sub_cols), ncol=5))
  dev.off()
}
## quartz_off_screen 
##                 2

old Fig 6B

p1 <- plot_cell_pca_simp(spret_cell_pca, spret_cell_df_with_cor, "PC1", "PC2")
p2 <- plot_cell_pca_simp(cast_cell_pca, cast_cell_df_with_cor, "PC1", "PC3")

p1
p2

if (!is.na(fig_dir)) {
  pdf(file.path(fig_dir, "old_fig6B.pdf"), 4, 3)
  print(p1)
  print(p2)
  dev.off()
}

Random forest

Spret, all features

md_spret_data <- bind_rows(hb_spret_df_with_cor %>% add_column(RESPONSE="Y", TYPE="hp", .before=1),
                           m2_spret_df_with_cor %>% add_column(RESPONSE="Y", TYPE="m2", .before=1),
                           hb_spret_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="hp", .before=1),
                           m2_spret_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="m2", .before=1)) %>%
  select(-c(CELL_ID, CHROM, START, END, SCORE, STRAND, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75, NOTE, SEGREGATION))

table(md_spret_data$RESPONSE)
## 
##     N     Y 
## 23785 23785
if (!is.na(fig_dir)) saveRDS(md_spret_data, file.path(fig_dir, "180820_md_spret_data.rds"))

Spret, MIP > 0.5 features

spret_bma_coef <- coef(spret_window_bas)
spret_bma_top_predictors <- setdiff(spret_bma_coef$namesx[spret_bma_coef$probne0 > 0.5], "Intercept")
use_spret_cols <- c('RESPONSE', 'CENTROMERIC', 'TELOMERIC', 'QUANTILE_25_75_FLAG', 'QUANTILE_75_100_FLAG', 'GC', 'SPO11_BOTH', 'CNV_SPRET_GAIN', 'CNV_SPRET_LOSS', 'THREE_PRIME_UTR', 'GENE', 'PSEUDOGENIC_TRANSCRIPT', 'RMSK_DNA', 'RMSK_LINE', 'RMSK_LTR', 'RMSK_LC', 'RMSK_SINE', 'RMSK_SATE', 'SEQ_DIV_SPRET', 'H3K36ME3', 'H3K4ME1', 'DMRT6', 'H3K27ME3', 'CTCFL', 'MESC_REPLI_SEQ', 'TOP2B_MEF', 'PATSKI_ALLELIC_CTCF_B6', 'PATSKI_ALLELIC_CTCF_SPRET', 'P7GONIA')
assertthat::assert_that(setequal(c('RESPONSE', spret_bma_top_predictors), use_spret_cols))
## [1] TRUE
md_spret_sub_data <- md_spret_data[use_spret_cols]
if (!is.na(fig_dir)) saveRDS(md_spret_sub_data, file.path(fig_dir, "180820_md_spret_sub_data.rds"))

Cast, all features

md_cast_data <- bind_rows(hb_cast_df_with_cor %>% add_column(RESPONSE="Y", TYPE="hp", .before=1),
                          m2_cast_df_with_cor %>% add_column(RESPONSE="Y", TYPE="m2", .before=1),
                          hb_cast_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="hp", .before=1),
                          m2_cast_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="m2", .before=1)) %>%
  select(-c(CELL_ID, CHROM, START, END, SCORE, STRAND, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75, NOTE, SEGREGATION)) %>%
  drop_na()

table(md_cast_data$RESPONSE) # slightly different
## 
##     N     Y 
## 62715 63001
if (!is.na(fig_dir)) saveRDS(md_cast_data, file.path(fig_dir, "180820_md_cast_data.rds"))

Cast, MIP > 0.5 features

cast_bma_coef <- coef(cast_window_bas)
cast_bma_top_predictors <- setdiff(cast_bma_coef$namesx[cast_bma_coef$probne0 > 0.5], "Intercept")
use_cast_cols <- c("RESPONSE","CENTROMERIC","TELOMERIC","QUANTILE_0_25_FLAG","QUANTILE_25_75_FLAG","GC","SSDS_F1","SSDS_B6","SSDS_CAST","CNV_CAST_GAIN","THREE_PRIME_UTR","GENE","PSEUDOGENIC_TRANSCRIPT","RMSK_DNA","RMSK_LINE","RMSK_LC","RMSK_SIMPLE_REPEAT","SEQ_DIV_CAST","H3K4ME1","H3K4ME3","DMRT6","H3K27ME3","MESC_REPLI_SEQ","TOP2A_MEF","PATSKI_ALLELIC_CTCF_B6","P7GONIA")

assertthat::assert_that(setequal(c('RESPONSE', cast_bma_top_predictors), use_cast_cols))
## [1] TRUE
md_cast_sub_data <- md_cast_data[use_cast_cols]
if (!is.na(fig_dir)) saveRDS(md_cast_sub_data, file.path(fig_dir, "180820_md_cast_sub_data.rds"))

Run random forest

Run these on a server.

Spret, all features

setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)

md_data <- readRDS("180820_md_spret_data.rds")
set.seed(12345)
spret_rf_ncv <- lapply(1:5, function(x) {
  train_n <- round(nrow(md_data) * 0.9)
  train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
  train_df <- md_data[train_idx, ]
  train_df$RESPONSE <- factor(train_df$RESPONSE)
  test_df <- md_data[-train_idx, ]
  train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
                                classProbs = TRUE, 
                                summaryFunction = twoClassSummary, allowParallel = TRUE)
  model_weights <- ifelse(train_df$RESPONSE == "Y",
                          (1/table(train_df$RESPONSE)["Y"]) * 0.5,
                          (1/table(train_df$RESPONSE)["N"]) * 0.5)
  md <- train(RESPONSE ~ ., data = train_df, 
              method = "rf", 
              trControl = train_control,
              weights = model_weights,
              metric = "ROC",
              na.action = "na.omit")
  pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
  pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
  ret <- list("md"=md,
              "pred"=pred,
              "pred_df"=pred_df)
  return(ret)
})

spret_rf_ncv_pred_df <- map_df(spret_rf_ncv, ~.x$pred_df)
saveRDS(spret_rf_ncv_pred_df, "180820_spret_rf_ncv_pred_df.rds")
saveRDS(spret_rf_ncv, "180820_spret_rf_ncv.rds")

Spret, top features

setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)

md_data <- readRDS("180820_md_spret_sub_data.rds")
set.seed(12345)
spret_sub_rf_ncv <- lapply(1:5, function(x) {
  train_n <- round(nrow(md_data) * 0.9)
  train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
  train_df <- md_data[train_idx, ]
  train_df$RESPONSE <- factor(train_df$RESPONSE)
  test_df <- md_data[-train_idx, ]
  train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
                                classProbs = TRUE, 
                                summaryFunction = twoClassSummary, allowParallel = TRUE)
  model_weights <- ifelse(train_df$RESPONSE == "Y",
                          (1/table(train_df$RESPONSE)["Y"]) * 0.5,
                          (1/table(train_df$RESPONSE)["N"]) * 0.5)
  md <- train(RESPONSE ~ ., data = train_df, 
              method = "rf", 
              trControl = train_control,
              weights = model_weights,
              metric = "ROC",
              na.action = "na.omit")
  pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
  pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
  ret <- list("md"=md,
              "pred"=pred,
              "pred_df"=pred_df)
  return(ret)
})

spret_sub_rf_ncv_pred_df <- map_df(spret_sub_rf_ncv, ~.x$pred_df)
saveRDS(spret_sub_rf_ncv_pred_df, "180820_spret_sub_rf_ncv_pred_df.rds")
saveRDS(spret_sub_rf_ncv, "180820_spret_sub_rf_ncv.rds")

Cast, all features

setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)

md_data <- readRDS("180820_md_cast_data.rds")
set.seed(12345)
cast_rf_ncv <- lapply(1:5, function(x) {
  train_n <- round(nrow(md_data) * 0.9)
  train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
  train_df <- md_data[train_idx, ]
  train_df$RESPONSE <- factor(train_df$RESPONSE)
  test_df <- md_data[-train_idx, ]
  train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
                                classProbs = TRUE, 
                                summaryFunction = twoClassSummary, allowParallel = TRUE)
  model_weights <- ifelse(train_df$RESPONSE == "Y",
                          (1/table(train_df$RESPONSE)["Y"]) * 0.5,
                          (1/table(train_df$RESPONSE)["N"]) * 0.5)
  md <- train(RESPONSE ~ ., data = train_df, 
              method = "rf", 
              trControl = train_control,
              weights = model_weights,
              metric = "ROC",
              na.action = "na.omit")
  pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
  pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
  ret <- list("md"=md,
              "pred"=pred,
              "pred_df"=pred_df)
  return(ret)
})

cast_rf_ncv_pred_df <- map_df(cast_rf_ncv, ~.x$pred_df)
saveRDS(cast_rf_ncv_pred_df, "180820_cast_rf_ncv_pred_df.rds")
saveRDS(cast_rf_ncv, "180820_cast_rf_ncv.rds")

Cast, top features

setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)

md_data <- readRDS("180820_md_cast_sub_data.rds")
set.seed(12345)
cast_sub_rf_ncv <- lapply(1:5, function(x) {
  train_n <- round(nrow(md_data) * 0.9)
  train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
  train_df <- md_data[train_idx, ]
  train_df$RESPONSE <- factor(train_df$RESPONSE)
  test_df <- md_data[-train_idx, ]
  train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
                                classProbs = TRUE, 
                                summaryFunction = twoClassSummary, allowParallel = TRUE)
  model_weights <- ifelse(train_df$RESPONSE == "Y",
                          (1/table(train_df$RESPONSE)["Y"]) * 0.5,
                          (1/table(train_df$RESPONSE)["N"]) * 0.5)
  md <- train(RESPONSE ~ ., data = train_df, 
              method = "rf", 
              trControl = train_control,
              weights = model_weights,
              metric = "ROC",
              na.action = "na.omit")
  pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
  pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
  ret <- list("md"=md,
              "pred"=pred,
              "pred_df"=pred_df)
  return(ret)
})

cast_sub_rf_ncv_pred_df <- map_df(cast_sub_rf_ncv, ~.x$pred_df)
saveRDS(cast_sub_rf_ncv_pred_df, "180820_cast_sub_rf_ncv_pred_df.rds")
saveRDS(cast_sub_rf_ncv, "180820_cast_sub_rf_ncv.rds")

Fig 6E, 6F, ROC and AUC from pre-generated random forests

spret_rf_ncv_pred_df <- readRDS(slfile("180820_spret_rf_ncv_pred_df.rds"))
spret_sub_rf_ncv_pred_df <- readRDS(slfile("180820_spret_sub_rf_ncv_pred_df.rds"))
cast_rf_ncv_pred_df <- readRDS(slfile("180820_cast_rf_ncv_pred_df.rds"))
cast_sub_rf_ncv_pred_df <- readRDS(slfile("180820_cast_sub_rf_ncv_pred_df.rds"))

spret_rf_roc <- roc(truth ~ pred, spret_rf_ncv_pred_df)
spret_sub_rf_roc <- roc(truth ~ pred, spret_sub_rf_ncv_pred_df)
cast_rf_roc <- roc(truth ~ pred, cast_rf_ncv_pred_df)
cast_sub_rf_roc <- roc(truth ~ pred, cast_sub_rf_ncv_pred_df)

spret_rf_roc
## 
## Call:
## roc.formula(formula = truth ~ pred, data = spret_rf_ncv_pred_df)
## 
## Data: pred in 11871 controls (truth N) < 11914 cases (truth Y).
## Area under the curve: 0.7396
spret_sub_rf_roc
## 
## Call:
## roc.formula(formula = truth ~ pred, data = spret_sub_rf_ncv_pred_df)
## 
## Data: pred in 11918 controls (truth N) < 11867 cases (truth Y).
## Area under the curve: 0.7209
cast_rf_roc
## 
## Call:
## roc.formula(formula = truth ~ pred, data = cast_rf_ncv_pred_df)
## 
## Data: pred in 31549 controls (truth N) < 31311 cases (truth Y).
## Area under the curve: 0.8471
cast_sub_rf_roc
## 
## Call:
## roc.formula(formula = truth ~ pred, data = cast_sub_rf_ncv_pred_df)
## 
## Data: pred in 31290 controls (truth N) < 31570 cases (truth Y).
## Area under the curve: 0.8506
ncv_roc_df <- bind_rows(
  tibble(Sensitivity=spret_rf_roc$sensitivities, Specificity=spret_rf_roc$specificities) %>%
    mutate(Strain="B6/Spret", Predictors="All features"),
  tibble(Sensitivity=spret_sub_rf_roc$sensitivities, Specificity=spret_sub_rf_roc$specificities) %>%
    mutate(Strain="B6/Spret", Predictors="Top BMA features"),
  tibble(Sensitivity=cast_rf_roc$sensitivities, Specificity=cast_rf_roc$specificities) %>%
    mutate(Strain="B6/Cast", Predictors="All features"),
  tibble(Sensitivity=cast_sub_rf_roc$sensitivities, Specificity=cast_sub_rf_roc$specificities) %>%
    mutate(Strain="B6/Cast", Predictors="Top BMA features")
)

p1 <- ggplot(ncv_roc_df %>% filter(Strain == "B6/Spret"), aes(Specificity, Sensitivity)) +
  geom_segment(x=0, y=1, xend=-1, yend=0, color="grey50", size=0.3) +
  geom_line(color="orangered") +
  facet_wrap(~Predictors, nrow=1) +
  xlim(1, 0) +
  ylim(0, 1) +
  theme_classic()

p2 <- ggplot(ncv_roc_df %>% filter(Strain == "B6/Cast"), aes(Specificity, Sensitivity)) +
  geom_segment(x=0, y=1, xend=-1, yend=0, color="grey50", size=0.3) +
  geom_line(color="orangered") +
  facet_wrap(~Predictors, nrow=1) +
  xlim(1, 0) +
  ylim(0, 1) +
  theme_classic()

p1

p2

if (!is.na(fig_dir)) {
  ggsave("fig6E.pdf", p1, "pdf", fig_dir, width=6, height=2.3)
  ggsave("fig6F.pdf", p2, "pdf", fig_dir, width=6, height=2.3)
}